Skip to content

Supported Functions

using TidierData
using TidierCats
using Random

Random.seed!(10)

categories = ["High", "Medium", "Low", "Zilch"]

random_indices = rand(1:length(categories), 57)

df = DataFrame(
    ID = 1:57,
    CatVar = categorical([categories[i] for i in random_indices], levels = categories)
)
first(df, 5)
5×2 DataFrame
RowIDCatVar
Int64Cat…
11Medium
22High
33High
44High
55Medium

cat_relevel()¤

This function changes the order of levels in a categorical variable. It accepts two arguments - a column name and an array of levels in the desired order.

custom_order = @chain df begin
    @mutate(CatVar = cat_relevel(CatVar, ["Zilch", "Medium", "High", "Low"]))
end

print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(custom_order.CatVar))
["Zilch", "Medium", "High", "Low"]

cat_rev()¤

This function reverses the order of levels in a categorical variable. It requires only one argument - the name of the column whose levels are to be reversed.

reversed_order = @chain df begin
    @mutate(CatVar = cat_rev(CatVar))
end

print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(reversed_order.CatVar))
["Zilch", "Low", "Medium", "High"]

cat_infreq()¤

This function reorders the levels of a categorical variable based on their frequencies, with the most frequent level first. Its single argument is the column name.

@chain df begin
    @count(CatVar)
end

orderedbyfrequency = @chain df begin
    @mutate(CatVar = cat_infreq(CatVar))
end

print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(orderedbyfrequency.CatVar))


@chain df begin
    @count(CatVar)
end
4×2 DataFrame
RowCatVarn
Cat…Int64
1Medium11
2High19
3Low14
4Zilch13

cat_lump()¤

This function lumps the least frequent levels into a new "Other" level. It accepts two arguments - a column name and an integer specifying the number of levels to keep.

lumped_cats = @chain df begin
    @mutate(CatVar = cat_lump(CatVar,2))
end

print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(lumped_cats.CatVar))


@chain lumped_cats begin
    @count(CatVar)
end
3×2 DataFrame
RowCatVarn
Cat…Int64
1Other24
2High19
3Low14

cat_reorder()¤

This function reorders the levels of a categorical variable based on a summary statistic of a second variable. It takes three arguments - a categorical column, a numerical column by which to reorder, and a function to calculate the summary statistic (currently only mean and median are supported). There is a fourth, optional argument which defaults to true; if set to false, it orders the categories in ascending order.

df3 = DataFrame(
    cat_var = repeat(["Low", "Medium", "High"], outer = 10),
    order_var = rand(30)
)

df4 = @chain df3 begin
    @mutate(cat_var= cat_reorder(cat_var, order_var, "median" ))
end


print(levels(df3.cat_var))
["High", "Low", "Medium"]

and

print(levels(df4.cat_var))


@chain df3 begin
    @mutate(catty = as_categorical(cat_var))
    @group_by(catty)
end

GroupedDataFrame with 3 groups based on key: catty

First Group (10 rows): catty = CategoricalValue{String, UInt32} "Low"
Rowcat_varorder_varcatty
StringFloat64Cat…
1Low0.0904901Low
2Low0.474166Low
3Low0.649207Low
4Low0.813709Low
5Low0.372275Low
6Low0.434213Low
7Low0.554675Low
8Low0.120808Low
9Low0.520476Low
10Low0.828585Low

⋮

Last Group (10 rows): catty = CategoricalValue{String, UInt32} "High"
Rowcat_varorder_varcatty
StringFloat64Cat…
1High0.0557954High
2High0.452822High
3High0.392587High
4High0.646916High
5High0.904497High
6High0.827882High
7High0.0906015High
8High0.683657High
9High0.389327High
10High0.692575High

cat_collapse()¤

This function collapses levels in a categorical variable according to a specified mapping. It requires two arguments - a categorical column and a dictionary that maps original levels to new ones.

df5 = @chain df begin
    @mutate(CatVar = cat_collapse(CatVar, Dict("Low" => "bad", "Zilch" => "bad")))
end

print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(df5.CatVar))
["High", "Medium", "bad"]

as_categorical()¤

This function converts a standard Julia array to a categorical array. The only argument it needs is the column name to be converted.

test = DataFrame( w = ["A", "B", "C", "D"])

@chain test begin
    @mutate(w = as_categorical(w))
end
4×1 DataFrame
Roww
Cat…
1A
2B
3C
4D

cat_lump_min()¤

This function will lump any category with fewer than the minimum number of entries and recategorize it as "Other" by default, or as a category name chosen by the user.

@chain df begin
    @count(CatVar)
end
lumpedbymin = @chain df begin
    @mutate(CatVar = cat_lump_min(CatVar, 14))
end

print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(lumpedbymin.CatVar))
["High", "Low", "Other"]

cat_lump_prop()¤

This function will lump any category with less than the minimum proportion and recategorize it as "Other" by default, or as a category name chosen by the user.

lumpedbyprop = @chain df begin
    @mutate(CatVar = cat_lump_prop(CatVar, .25, "wow"))
end


print(levels(df.CatVar))
["High", "Medium", "Low", "Zilch"]

and

print(levels(lumpedbyprop.CatVar))
["High", "wow"]

This page was generated using Literate.jl.