Supported Functions

using TidierData
using TidierCats
using Random

# Fix the RNG seed so the random draw below is reproducible.
Random.seed!(10)

# Level labels, in the order they are registered as levels of the column below.
categories = ["High", "Medium", "Low", "Zilch"]

# Draw 57 random positions into `categories`, one per row of the data frame.
random_indices = rand(1:length(categories), 57)

# Example data: an integer ID plus a categorical column whose level order is
# set explicitly to the order of `categories`.
df = DataFrame(
    ID = 1:57,
    CatVar = categorical([categories[i] for i in random_indices], levels = categories)
)
# Preview the first five rows.
first(df, 5)

5×2 DataFrame

Row	ID	CatVar
	Int64	Cat…
1	1	Medium
2	2	High
3	3	High
4	4	High
5	5	Medium

`cat_relevel()`¤

This function changes the order of levels in a categorical variable. It accepts two arguments - a column name and an array of levels in the desired order.

# Re-level CatVar using an explicitly given level order; the result goes into a
# new data frame so it can be compared with `df` below.
custom_order = @chain df begin
    @mutate(CatVar = cat_relevel(CatVar, ["Zilch", "Medium", "High", "Low"]))
end

print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(custom_order.CatVar))

["Zilch", "Medium", "High", "Low"]

`cat_rev()`¤

This function reverses the order of levels in a categorical variable. It requires only one argument - the column whose levels are to be reversed.

# Reverse the level order of CatVar; the result goes into a new data frame so
# it can be compared with `df` below.
reversed_order = @chain df begin
    @mutate(CatVar = cat_rev(CatVar))
end

print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(reversed_order.CatVar))

["Zilch", "Low", "Medium", "High"]

`cat_infreq()`¤

This function reorders the levels of a categorical variable based on their frequencies, with the most frequent level first. Its single argument is the column name.

# Tabulate how many rows fall into each level of CatVar.
@chain df begin
    @count(CatVar)
end

# Reorder the levels of CatVar by frequency, most frequent level first.
orderedbyfrequency = @chain df begin
    @mutate(CatVar = cat_infreq(CatVar))
end

print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(orderedbyfrequency.CatVar))


# Tabulate the rows per level of CatVar.
# NOTE(review): this counts the original `df`, not `orderedbyfrequency` —
# confirm which data frame was intended here.
@chain df begin
    @count(CatVar)
end

4×2 DataFrame

Row	CatVar	n
	Cat…	Int64
1	Medium	11
2	High	19
3	Low	14
4	Zilch	13

`cat_lump()`¤

This function lumps the least frequent levels into a new "Other" level. It accepts two arguments - a column name and an integer specifying the number of levels to keep.

# Keep 2 levels of CatVar and lump the remaining, least frequent levels into "Other".
lumped_cats = @chain df begin
    @mutate(CatVar = cat_lump(CatVar,2))
end

print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(lumped_cats.CatVar))


# Tabulate the rows per level after lumping.
@chain lumped_cats begin
    @count(CatVar)
end

3×2 DataFrame

Row	CatVar	n
	Cat…	Int64
1	Other	24
2	High	19
3	Low	14

`cat_reorder()`¤

This function reorders the levels of a categorical variable based on a summary statistic of a second variable. It takes three arguments - a categorical column, a numerical column by which to reorder, and a function to calculate the summary statistic (currently only mean and median are supported). There is an optional fourth argument, which defaults to true; if it is set to false, the categories are ordered in ascending order.

# Example data: `cat_var` cycles through "Low", "Medium", "High" ten times
# (30 rows); `order_var` holds 30 random Float64 values from `rand`.
df3 = DataFrame(
    cat_var = repeat(["Low", "Medium", "High"], outer = 10),
    order_var = rand(30)
)

# Reorder the levels of cat_var using the median of order_var as the summary
# statistic.
df4 = @chain df3 begin
    @mutate(cat_var= cat_reorder(cat_var, order_var, "median" ))
end


print(levels(df3.cat_var))

["High", "Low", "Medium"]

and

print(levels(df4.cat_var))


# Add a categorical copy of cat_var as `catty`, then group the rows by it.
@chain df3 begin
    @mutate(catty = as_categorical(cat_var))
    @group_by(catty)
end

GroupedDataFrame with 3 groups based on key: catty

First Group (10 rows): catty = CategoricalValue{String, UInt32} "Low"

Row	cat_var	order_var	catty
	String	Float64	Cat…
1	Low	0.0904901	Low
2	Low	0.474166	Low
3	Low	0.649207	Low
4	Low	0.813709	Low
5	Low	0.372275	Low
6	Low	0.434213	Low
7	Low	0.554675	Low
8	Low	0.120808	Low
9	Low	0.520476	Low
10	Low	0.828585	Low

⋮

Last Group (10 rows): catty = CategoricalValue{String, UInt32} "High"

Row	cat_var	order_var	catty
	String	Float64	Cat…
1	High	0.0557954	High
2	High	0.452822	High
3	High	0.392587	High
4	High	0.646916	High
5	High	0.904497	High
6	High	0.827882	High
7	High	0.0906015	High
8	High	0.683657	High
9	High	0.389327	High
10	High	0.692575	High

`cat_collapse()`¤

This function collapses levels in a categorical variable according to a specified mapping. It requires two arguments - a categorical column and a dictionary that maps original levels to new ones.

# Collapse the "Low" and "Zilch" levels into a single "bad" level; levels not
# named in the dictionary ("High", "Medium") are kept as they are.
df5 = @chain df begin
    @mutate(CatVar = cat_collapse(CatVar, Dict("Low" => "bad", "Zilch" => "bad")))
end

print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(df5.CatVar))

["High", "Medium", "bad"]

`as_categorical()`¤

This function converts a standard Julia array to a categorical array. The only argument it needs is the column name to be converted.

# One-column data frame of plain strings.
test = DataFrame( w = ["A", "B", "C", "D"])

# Convert column `w` to a categorical column.
@chain test begin
    @mutate(w = as_categorical(w))
end

4×1 DataFrame

Row	w
	Cat…
1	A
2	B
3	C
4	D

`cat_lump_min()`¤

This function will lump any category with fewer than the minimum number of entries and recategorize it as "Other" by default, or as a category name chosen by the user.

# Tabulate the rows per level of CatVar, for comparison with the threshold below.
@chain df begin
    @count(CatVar)
end
# Lump every level with fewer than 14 entries into the default "Other" level.
lumpedbymin = @chain df begin
    @mutate(CatVar = cat_lump_min(CatVar, 14))
end

print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(lumpedbymin.CatVar))

["High", "Low", "Other"]

`cat_lump_prop()`¤

This function will lump any category whose proportion is less than the minimum proportion and recategorize it as "Other" by default, or as a category name chosen by the user.

# Lump every level whose proportion is below 0.25 into a level named "wow"
# (instead of the default "Other").
lumpedbyprop = @chain df begin
    @mutate(CatVar = cat_lump_prop(CatVar, .25, "wow"))
end


print(levels(df.CatVar))

["High", "Medium", "Low", "Zilch"]

and

print(levels(lumpedbyprop.CatVar))

["High", "wow"]

This page was generated using Literate.jl.

Supported Functions

cat_relevel()¤

cat_rev()¤

cat_infreq()¤

cat_lump()¤

cat_reorder()¤

cat_collapse()¤

as_categorical()¤

cat_lump_min()¤

cat_lump_prop()¤

`cat_relevel()`¤

`cat_rev()`¤

`cat_infreq()`¤

`cat_lump()`¤

`cat_reorder()`¤

`cat_collapse()`¤

`as_categorical()`¤

`cat_lump_min()`¤

`cat_lump_prop()`¤