Supported Functions
using TidierData
using TidierCats
using Random
Random.seed!(10)
categories = ["High", "Medium", "Low", "Zilch"]
random_indices = rand(1:length(categories), 57)
df = DataFrame(
ID = 1:57,
CatVar = categorical([categories[i] for i in random_indices], levels = categories)
)
first(df, 5)
Row | ID | CatVar |
---|---|---|
Int64 | Cat… | |
1 | 1 | Medium |
2 | 2 | High |
3 | 3 | High |
4 | 4 | High |
5 | 5 | Medium |
cat_relevel()
¤
This function changes the order of levels in a categorical variable. It accepts two arguments - a column name and an array of levels in the desired order.
custom_order = @chain df begin
@mutate(CatVar = cat_relevel(CatVar, ["Zilch", "Medium", "High", "Low"]))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(custom_order.CatVar))
["Zilch", "Medium", "High", "Low"]
cat_rev()
¤
This function reverses the order of levels in a categorical variable. It requires only one argument - the column name whose levels are to be reversed.
reversed_order = @chain df begin
@mutate(CatVar = cat_rev(CatVar))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(reversed_order.CatVar))
["Zilch", "Low", "Medium", "High"]
cat_infreq()
¤
This function reorders levels of a categorical variable based on their frequencies, with the most frequent level first. The single argument is the column name.
@chain df begin
@count(CatVar)
end
orderedbyfrequency = @chain df begin
@mutate(CatVar = cat_infreq(CatVar))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(orderedbyfrequency.CatVar))
@chain df begin
@count(CatVar)
end
Row | CatVar | n |
---|---|---|
Cat… | Int64 | |
1 | Medium | 11 |
2 | High | 19 |
3 | Low | 14 |
4 | Zilch | 13 |
cat_lump()
¤
This function lumps the least frequent levels into a new "Other" level. It accepts two arguments - a column name and an integer specifying the number of levels to keep.
lumped_cats = @chain df begin
@mutate(CatVar = cat_lump(CatVar,2))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(lumped_cats.CatVar))
@chain lumped_cats begin
@count(CatVar)
end
Row | CatVar | n |
---|---|---|
Cat… | Int64 | |
1 | Other | 24 |
2 | High | 19 |
3 | Low | 14 |
cat_reorder()
¤
This function reorders levels of a categorical variable based on a summary statistic (such as the mean) of a second variable. It takes three arguments - a categorical column, a numerical column by which to reorder, and the name of the function used to calculate the summary statistic, passed as a string (currently only "mean" and "median" are supported). There is a fourth optional argument, which defaults to true; if set to false, it orders the categories in ascending order.
df3 = DataFrame(
cat_var = repeat(["Low", "Medium", "High"], outer = 10),
order_var = rand(30)
)
df4 = @chain df3 begin
@mutate(cat_var= cat_reorder(cat_var, order_var, "median" ))
end
print(levels(df3.cat_var))
["High", "Low", "Medium"]
and
print(levels(df4.cat_var))
@chain df3 begin
@mutate(catty = as_categorical(cat_var))
@group_by(catty)
end
GroupedDataFrame with 3 groups based on key: catty
Row | cat_var | order_var | catty |
---|---|---|---|
String | Float64 | Cat… | |
1 | Low | 0.0904901 | Low |
2 | Low | 0.474166 | Low |
3 | Low | 0.649207 | Low |
4 | Low | 0.813709 | Low |
5 | Low | 0.372275 | Low |
6 | Low | 0.434213 | Low |
7 | Low | 0.554675 | Low |
8 | Low | 0.120808 | Low |
9 | Low | 0.520476 | Low |
10 | Low | 0.828585 | Low |
⋮
Row | cat_var | order_var | catty |
---|---|---|---|
String | Float64 | Cat… | |
1 | High | 0.0557954 | High |
2 | High | 0.452822 | High |
3 | High | 0.392587 | High |
4 | High | 0.646916 | High |
5 | High | 0.904497 | High |
6 | High | 0.827882 | High |
7 | High | 0.0906015 | High |
8 | High | 0.683657 | High |
9 | High | 0.389327 | High |
10 | High | 0.692575 | High |
cat_collapse()
¤
This function collapses levels in a categorical variable according to a specified mapping. It requires two arguments - a categorical column and a dictionary that maps original levels to new ones.
df5 = @chain df begin
@mutate(CatVar = cat_collapse(CatVar, Dict("Low" => "bad", "Zilch" => "bad")))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(df5.CatVar))
["High", "Medium", "bad"]
as_categorical()
¤
This function converts a standard Julia array to a categorical array. The only argument it needs is the name of the column to be converted.
test = DataFrame( w = ["A", "B", "C", "D"])
@chain test begin
@mutate(w = as_categorical(w))
end
Row | w |
---|---|
Cat… | |
1 | A |
2 | B |
3 | C |
4 | D |
cat_lump_min()
¤
This function will lump any category with fewer than the minimum number of entries and recategorize it as "Other" by default, or as a category name chosen by the user.
@chain df begin
@count(CatVar)
end
lumpedbymin = @chain df begin
@mutate(CatVar = cat_lump_min(CatVar, 14))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(lumpedbymin.CatVar))
["High", "Low", "Other"]
cat_lump_prop()
¤
This function will lump any category with less than the minimum proportion and recategorize it as "Other" by default, or as a category name chosen by the user.
lumpedbyprop = @chain df begin
@mutate(CatVar = cat_lump_prop(CatVar, .25, "wow"))
end
print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]
and
print(levels(lumpedbyprop.CatVar))
["High", "wow"]
This page was generated using Literate.jl.