Reference

Index

Reference - Exported functions

# TidierData.TidierData_set — Method.

TidierData_set(option::AbstractString, value::Bool)

Set package options.

Here are the supported options and what they do:

"code": Defaults to false. If set to true, this option displays the DataFrames.jl code generated by the TidierData.jl package. It is useful for debugging whether errors are introduced by TidierData.jl's generated code.

Arguments

option: "code"
value: true or false

source

# TidierData.across — Method.

across(variable[s], function[s])

Apply functions to multiple variables. If specifying multiple variables or functions, surround them with parentheses so that they are recognized as a tuple.

This function should only be called inside of TidierData.jl macros.

Arguments

variable[s]: An unquoted variable, or if multiple, an unquoted tuple of variables.
function[s]: A function, or if multiple, a tuple of functions.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @summarize(across(b, minimum))
       end
1×1 DataFrame
 Row │ b_minimum 
     │ Int64     
─────┼───────────
   1 │         1

julia> @chain df begin
         @summarize(across(where(is_number), minimum))
       end
1×2 DataFrame
 Row │ b_minimum  c_minimum 
     │ Int64      Int64     
─────┼──────────────────────
   1 │         1         11

julia> @chain df begin
         @summarize(across((b,c), (minimum, maximum)))
       end
1×4 DataFrame
 Row │ b_minimum  c_minimum  b_maximum  c_maximum 
     │ Int64      Int64      Int64      Int64     
─────┼────────────────────────────────────────────
   1 │         1         11          5         15

julia> @chain df begin
         @mutate(across((b,c), (minimum, maximum)))
       end
5×7 DataFrame
 Row │ a     b      c      b_minimum  c_minimum  b_maximum  c_maximum 
     │ Char  Int64  Int64  Int64      Int64      Int64      Int64     
─────┼────────────────────────────────────────────────────────────────
   1 │ a         1     11          1         11          5         15
   2 │ b         2     12          1         11          5         15
   3 │ c         3     13          1         11          5         15
   4 │ d         4     14          1         11          5         15
   5 │ e         5     15          1         11          5         15

julia> @chain df begin
         @mutate(across((b, starts_with("c")), (minimum, maximum)))
       end
5×7 DataFrame
 Row │ a     b      c      b_minimum  c_minimum  b_maximum  c_maximum 
     │ Char  Int64  Int64  Int64      Int64      Int64      Int64     
─────┼────────────────────────────────────────────────────────────────
   1 │ a         1     11          1         11          5         15
   2 │ b         2     12          1         11          5         15
   3 │ c         3     13          1         11          5         15
   4 │ d         4     14          1         11          5         15
   5 │ e         5     15          1         11          5         15

source

# TidierData.as_float — Method.

as_float(value)

Convert a number or string to a Float64 data type.

This is a useful helper for type conversions. Missing values are propagated.

Arguments

value: An AbstractString, Number, or missing value.

Examples

julia> as_float(1)
1.0

julia> as_float("1.5")
1.5

julia> as_float(missing)
missing

source

# TidierData.as_integer — Method.

as_integer(value)

Convert a number or string to an Int64 data type.

This is a useful helper for type conversions. Missing values are propagated. Any values after the decimal point are removed.

Arguments

value: An AbstractString, Number, or missing value.

Examples

julia> as_integer(1)
1

julia> as_integer(1.5)
1

julia> as_integer("2")
2

julia> as_integer("2.5")
2

julia> as_integer(missing)
missing

source

# TidierData.as_string — Method.

as_string(value)

Convert a number or string to a String data type.

This is a useful helper for type conversions. Missing values are propagated.

Arguments

value: An AbstractString, Number, or missing value.

Examples

julia> as_string(1)
"1"

julia> as_string(1.5)
"1.5"

julia> as_string(missing)
missing

source

# TidierData.case_when — Method.

case_when(condition => return_value)
case_when(condition_1 => return_value_1, condition_2 => return_value_2, ...)

Return the corresponding return_value for the first condition that evaluates to true.

The most specific condition should be listed first and the most general condition should be listed last. If none of the conditions evaluate to true, then a missing value is returned.

Arguments

condition: A condition that evaluates to true, false, or missing.
return_value: The value to return if the condition is true.

Examples

julia> df = DataFrame(a = [1, 2, missing, 4, 5]);

julia> @chain df begin
         @mutate(b = case_when(a > 4  =>  "hi",
                               a > 2  =>  "medium",
                               a > 0  =>  "low"))
       end
5×2 DataFrame
 Row │ a        b       
     │ Int64?   String? 
─────┼──────────────────
   1 │       1  low
   2 │       2  low
   3 │ missing  missing 
   4 │       4  medium
   5 │       5  hi

julia> @chain df begin
         @mutate(b = case_when(a > 4  =>  "hi",
                               a > 2  =>  "medium",
                               a > 0  =>  "low",
                               true   =>  "unknown"))
       end
5×2 DataFrame
 Row │ a        b       
     │ Int64?   String  
─────┼──────────────────
   1 │       1  low
   2 │       2  low
   3 │ missing  unknown
   4 │       4  medium
   5 │       5  hi

julia> @chain df begin
         @mutate(b = case_when(a >= 3  =>  3,
                               true    =>  a))
       end
5×2 DataFrame
 Row │ a        b       
     │ Int64?   Int64?  
─────┼──────────────────
   1 │       1        1
   2 │       2        2
   3 │ missing  missing 
   4 │       4        3
   5 │       5        3

julia> @chain df begin
         @mutate(b = case_when(a >= 3        =>  3,
                               ismissing(a)  =>  0,
                               true          =>  a))
       end
5×2 DataFrame
 Row │ a        b     
     │ Int64?   Int64 
─────┼────────────────
   1 │       1      1
   2 │       2      2
   3 │ missing      0
   4 │       4      3
   5 │       5      3

source

# TidierData.desc — Method.

desc(col)

Orders the rows of a DataFrame column in descending order when used inside of @arrange(). This function should only be called inside of @arrange().

Arguments

col: An unquoted column name.

Examples

julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);

julia> @chain df begin
         @arrange(a, desc(b))
       end
10×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         2     12
   2 │ a         1     11
   3 │ b         4     14
   4 │ b         3     13
   5 │ c         6     16
   6 │ c         5     15
   7 │ d         8     18
   8 │ d         7     17
   9 │ e        10     20
  10 │ e         9     19

source

# TidierData.ends_with — Method.

ends_with(suffix)

Select all columns ending with the suffix.

Arguments

suffix: A string.

Examples

julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);

julia> @chain df begin 
         @select(ends_with("1"))
       end
5×2 DataFrame
 Row │ a_1    b_1   
     │ Int64  Int64 
─────┼──────────────
   1 │     1     21
   2 │     2     22
   3 │     3     23
   4 │     4     24
   5 │     5     25

source

# TidierData.everything — Method.

everything()

Select all (remaining) columns.

Arguments

None

Examples

julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);

julia> @chain df begin 
         @select(b_1, everything())
       end
5×3 DataFrame
 Row │ b_1    a_1    a_2   
     │ Int64  Int64  Int64 
─────┼─────────────────────
   1 │    21      1     11
   2 │    22      2     12
   3 │    23      3     13
   4 │    24      4     14
   5 │    25      5     15

source

# TidierData.if_else — Method.

if_else(condition, yes, no, [miss])

Return the yes value if the condition is true and the no value if the condition is false. If miss is specified, then the provided miss value is returned when the condition contains a missing value. If miss is not specified, then the returned value is an explicit missing value.

Arguments

condition: A condition that evaluates to true, false, or missing.
yes: Value to return if the condition is true.
no: Value to return if the condition is false.
miss: Optional. Value to return if the condition is missing.

Examples

julia> df = DataFrame(a = [1, 2, missing, 4, 5]);

julia> @chain df begin
         @mutate(b = if_else(a >= 3, "yes", "no"))
       end
5×2 DataFrame
 Row │ a        b       
     │ Int64?   String? 
─────┼──────────────────
   1 │       1  no
   2 │       2  no
   3 │ missing  missing 
   4 │       4  yes
   5 │       5  yes

julia> @chain df begin
         @mutate(b = if_else(a >= 3, "yes", "no", "unknown"))
       end
5×2 DataFrame
 Row │ a        b       
     │ Int64?   String  
─────┼──────────────────
   1 │       1  no
   2 │       2  no
   3 │ missing  unknown
   4 │       4  yes
   5 │       5  yes

julia> @chain df begin
         @mutate(b = if_else(a >= 3, 3, a))
       end
5×2 DataFrame
 Row │ a        b       
     │ Int64?   Int64?  
─────┼──────────────────
   1 │       1        1
   2 │       2        2
   3 │ missing  missing 
   4 │       4        3
   5 │       5        3

julia> @chain df begin
         @mutate(b = if_else(a >= 3, 3, a, 0))
       end
5×2 DataFrame
 Row │ a        b     
     │ Int64?   Int64 
─────┼────────────────
   1 │       1      1
   2 │       2      2
   3 │ missing      0
   4 │       4      3
   5 │       5      3

source

# TidierData.is_float — Method.

is_float(column::AbstractVector)

Determine if the given column contains floating-point numbers.

Arguments

column::AbstractVector: The column whose data type needs to be checked.

Returns

Bool: true if the column contains floating-point numbers, false otherwise.

Examples

julia> df = DataFrame(b = [missing, 2, 3],
                      c = [missing, 2.2, 34],
                      d = [missing, missing, "A"]);

julia> is_float(df.c)
true

julia> is_float(df.b)
false

source

# TidierData.is_integer — Method.

is_integer(column::AbstractVector)

Determine if the given column contains integers.

Arguments

column::AbstractVector: The column whose data type needs to be checked.

Returns

Bool: true if the column contains integers, false otherwise.

Examples

julia> df = DataFrame(b = [missing, 2, 3],
                      c = [missing, 2.2, 34],
                      d = [missing, missing, "A"]);

julia> is_integer(df.b)
true

julia> is_integer(df.d)
false

source

# TidierData.is_number — Method.

is_number(column::AbstractVector)

Determine if the given column contains numbers.

Arguments

column::AbstractVector: The column whose data type needs to be checked.

Returns

Bool: true if the column contains numbers, false otherwise.

Examples

julia> df = DataFrame(b = [missing, 2, 3],
                      c = [missing, 2.2, 34],
                      d = [missing, missing, "A"]);

julia> is_number(df.b)
true

julia> is_number(df.c)
true

julia> is_number(df.d)
false

source

# TidierData.is_string — Method.

is_string(column::AbstractVector)

Determine if the given column contains strings.

Arguments

column::AbstractVector: The column whose data type needs to be checked.

Returns

Bool: true if the column contains strings, false otherwise.

Examples

julia> df = DataFrame(b = [missing, 2, 3],
                      c = [missing, 2.2, 34],
                      d = [missing, missing, "A"]);

julia> is_string(df.d)
true

julia> is_string(df.c)
false

source

# TidierData.matches — Method.

matches(pattern, [flags])

Select all columns matching the pattern.

Arguments

pattern: A string.
flags: Optional string containing flags. "i" = Do case-insensitive pattern matching. "m" = Treat string as multiple lines. "s" = Treat string as a single line. "x" = Tell the regular expression parser to ignore most whitespace that is neither backslashed nor within a character class. You can use this to break up your regular expression into (slightly) more readable parts.

Examples

julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);

julia> @chain df begin 
         @select(matches("^a"))
       end
5×2 DataFrame
 Row │ a_1    a_2   
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

julia> @chain df begin 
         @select(matches("1$"))
       end
5×2 DataFrame
 Row │ a_1    b_1   
     │ Int64  Int64 
─────┼──────────────
   1 │     1     21
   2 │     2     22
   3 │     3     23
   4 │     4     24
   5 │     5     25

julia> @chain df begin 
         @select(matches("A", "i"))
       end
5×2 DataFrame
 Row │ a_1    a_2   
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

source

# TidierData.missing_if — Method.

missing_if(x, value)

Replace a specific value with missing in x.

Arguments

x: The input value which can be of any type. If x is already missing or equals value, the function will return missing. Otherwise, it returns x unaltered.
value: The specific value to be checked against.

Examples

julia> df = DataFrame(
              a = [1, missing, 3, 4],
              b = ["apple", "apple", "banana", "cherry"]
            );

julia> @chain df begin
         @mutate(a = missing_if(a, 4), 
                 b = missing_if(b, "apple"))
       end
4×2 DataFrame
 Row │ a        b       
     │ Int64?   String? 
─────┼──────────────────
   1 │       1  missing 
   2 │ missing  missing 
   3 │       3  banana
   4 │ missing  cherry

source

# TidierData.n — Method.

n()

Return the number of rows in the DataFrame or in the group if used in the context of a GroupedDataFrame.

Arguments

None

Examples

julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);

julia> @chain df begin
         @summarize(n = n())
       end
1×1 DataFrame
 Row │ n     
     │ Int64 
─────┼───────
   1 │    10

julia> @chain df begin
         @group_by(a)
         @summarize(n = n())
       end
5×2 DataFrame
 Row │ a     n     
     │ Char  Int64 
─────┼─────────────
   1 │ a         2
   2 │ b         2
   3 │ c         2
   4 │ d         2
   5 │ e         2

source

# TidierData.ntile — Method.

ntile(x, n::Integer)

Break the input vector into n equal-sized buckets.

ntile() is a rough rank that breaks the input vector into n buckets. If length(x) is not an integer multiple of n, the size of the buckets will differ by up to one, with larger buckets coming first.

Unlike other ranking functions, ntile() ignores ties: it will create evenly sized buckets even if the same value of x ends up in different buckets.

Arguments

x: A vector to rank. By default, the smallest values will get the smallest ranks. Missing values will be given rank missing.
n: Number of groups to bucket into.

Examples

julia> x = [5,1,3,2,2, missing]
6-element Vector{Union{Missing, Int64}}:
 5
 1
 3
 2
 2
  missing

julia> ntile(x, 2)
6-element Vector{Union{Missing, Int64}}:
 2
 1
 2
 1
 1
  missing

julia> ntile(x, 4)
6-element Vector{Union{Missing, Int64}}:
 4
 1
 3
 1
 2
  missing

julia> ntile(1:8, 3)
8-element Vector{Int64}:
 1
 1
 1
 2
 2
 2
 3
 3

julia> df = DataFrame(a = 1:8);

julia> @chain df begin
       @mutate(buckets = ntile(a, 3))
       end
8×2 DataFrame
 Row │ a      buckets 
     │ Int64  Int64   
─────┼────────────────
   1 │     1        1
   2 │     2        1
   3 │     3        1
   4 │     4        2
   5 │     5        2
   6 │     6        2
   7 │     7        3
   8 │     8        3

source

# TidierData.replace_missing — Method.

replace_missing(x, replacement)

Replace missing values in x with a specified replacement value.

Arguments

x: The input value which can be of any type. If x is missing, the function will return replacement. Otherwise, it returns x unaltered.
replacement: The value to replace missing with in x.

Examples

julia> df = DataFrame(
              a = [1, missing, 3, 4],
              b = [4, 5, missing, 8]
            );

julia> @chain df begin
         @mutate(a = replace_missing(a, 100),
                 b = replace_missing(b, 35))
       end
4×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     1      4
   2 │   100      5
   3 │     3     35
   4 │     4      8

source

# TidierData.row_number — Method.

row_number()

Return each row's number in a DataFrame or in the group if used in the context of a GroupedDataFrame.

Arguments

None

Examples

julia> df = DataFrame(a = repeat('a':'e', inner = 2));

julia> @chain df begin
         @mutate(row_num = row_number())
       end
10×2 DataFrame
 Row │ a     row_num 
     │ Char  Int64   
─────┼───────────────
   1 │ a           1
   2 │ a           2
   3 │ b           3
   4 │ b           4
   5 │ c           5
   6 │ c           6
   7 │ d           7
   8 │ d           8
   9 │ e           9
  10 │ e          10

julia> @chain df begin
         @mutate(row_num = row_number() + 1)
       end
10×2 DataFrame
 Row │ a     row_num 
     │ Char  Int64   
─────┼───────────────
   1 │ a           2
   2 │ a           3
   3 │ b           4
   4 │ b           5
   5 │ c           6
   6 │ c           7
   7 │ d           8
   8 │ d           9
   9 │ e          10
  10 │ e          11

julia> @chain df begin
         @filter(row_number() <= 5)
       end
5×1 DataFrame
 Row │ a    
     │ Char 
─────┼──────
   1 │ a
   2 │ a
   3 │ b
   4 │ b
   5 │ c

source

# TidierData.starts_with — Method.

starts_with(prefix)

Select all columns starting with the prefix.

Arguments

prefix: A string.

Examples

julia> df = DataFrame(a_1 = 1:5, a_2 = 11:15, b_1 = 21:25);

julia> @chain df begin 
         @select(starts_with("a"))
       end
5×2 DataFrame
 Row │ a_1    a_2   
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

source

# TidierData.where — Method.

where(function)

Selects columns on which a function returns true for all values of the column.

This function should only be called inside of TidierData.jl macros.

Arguments

function: A predicate function (one that returns true or false).

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @select(where(is_number))
       end
5×2 DataFrame
 Row │ b      c     
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

julia> @chain df begin
         @summarize(across(where(is_number), minimum))
       end
1×2 DataFrame
 Row │ b_minimum  c_minimum 
     │ Int64      Int64     
─────┼──────────────────────
   1 │         1         11

julia> @chain df begin
         @mutate(across(where(is_number), minimum))
       end
5×5 DataFrame
 Row │ a     b      c      b_minimum  c_minimum 
     │ Char  Int64  Int64  Int64      Int64     
─────┼──────────────────────────────────────────
   1 │ a         1     11          1         11
   2 │ b         2     12          1         11
   3 │ c         3     13          1         11
   4 │ d         4     14          1         11
   5 │ e         5     15          1         11

julia> df = DataFrame(a = repeat('a':'e', inner = 3),
                      b = 1:15,
                      c = 16:30,
                      d = 31:45);

julia> @chain df begin
         @group_by(a)
         @summarize(across(where(is_number), mean))
       end
5×4 DataFrame
 Row │ a     b_mean   c_mean   d_mean  
     │ Char  Float64  Float64  Float64 
─────┼─────────────────────────────────
   1 │ a         2.0     17.0     32.0
   2 │ b         5.0     20.0     35.0
   3 │ c         8.0     23.0     38.0
   4 │ d        11.0     26.0     41.0
   5 │ e        14.0     29.0     44.0

source

# TidierData.@anti_join — Macro.

@anti_join(df1, df2, [by])

Perform an anti-join on df1 and df2 with an optional by.

Arguments

df1: A DataFrame.
df2: A DataFrame.
by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

Examples

julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @anti_join(df1, df2)
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ b           2

julia> @anti_join(df1, df2, a)
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ b           2

julia> @anti_join(df1, df2, a = a)
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ b           2

julia> @anti_join(df1, df2, "a")
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ b           2

julia> @anti_join(df1, df2, "a" = "a")
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ b           2

source

# TidierData.@arrange — Macro.

@arrange(df, exprs...)

Order the rows of a DataFrame by the values of specified columns.

Arguments

df: A DataFrame.
exprs...: Variables from the input DataFrame. Use desc() to sort in descending order. Multiple variables can be specified, separated by commas.

Examples

julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20);

julia> @chain df begin
         @arrange(a)
       end
10×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ a         2     12
   3 │ b         3     13
   4 │ b         4     14
   5 │ c         5     15
   6 │ c         6     16
   7 │ d         7     17
   8 │ d         8     18
   9 │ e         9     19
  10 │ e        10     20

julia> @chain df begin
         @arrange(a, desc(b))
       end
10×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         2     12
   2 │ a         1     11
   3 │ b         4     14
   4 │ b         3     13
   5 │ c         6     16
   6 │ c         5     15
   7 │ d         8     18
   8 │ d         7     17
   9 │ e        10     20
  10 │ e         9     19

source

# TidierData.@bind_cols — Macro.

@bind_cols(dfs...)

Bind many DataFrames into one by column.

Arguments

dfs...: DataFrames to combine.

Examples

julia> df1 = DataFrame(a=1:3, b=1:3);

julia> df2 = DataFrame(a=4:6, b=4:6);

julia> df3 = DataFrame(a=7:9, c=7:9);

julia> @chain df1 begin
         @bind_cols(df2, df3)
       end
3×6 DataFrame
 Row │ a      b      a_1    b_1    a_2    c     
     │ Int64  Int64  Int64  Int64  Int64  Int64 
─────┼──────────────────────────────────────────
   1 │     1      1      4      4      7      7
   2 │     2      2      5      5      8      8
   3 │     3      3      6      6      9      9

source

# TidierData.@bind_rows — Macro.

@bind_rows(dfs..., [id])

Bind many DataFrames into one by row.

Columns present in at least one of the provided DataFrames are kept. Columns not present in some DataFrames are filled with missing values where necessary.

Arguments

dfs...: DataFrames to combine.
id: Optional. A string naming the DataFrame identifier column. When id is supplied, a new column of numeric identifiers is created to link each row to its original DataFrame.

Examples

julia> df1 = DataFrame(a=1:3, b=1:3);

julia> df2 = DataFrame(a=4:6, b=4:6);

julia> df3 = DataFrame(a=7:9, c=7:9);

julia> @chain df1 begin
         @bind_rows(df2)
       end
6×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     1      1
   2 │     2      2
   3 │     3      3
   4 │     4      4
   5 │     5      5
   6 │     6      6

When columns are not present in some DataFrames, they are filled with missing values.

julia> @chain df1 begin
         @bind_rows(df2, df3)
       end
9×3 DataFrame
 Row │ a      b        c       
     │ Int64  Int64?   Int64?  
─────┼─────────────────────────
   1 │     1        1  missing 
   2 │     2        2  missing 
   3 │     3        3  missing 
   4 │     4        4  missing 
   5 │     5        5  missing 
   6 │     6        6  missing 
   7 │     7  missing        7
   8 │     8  missing        8
   9 │     9  missing        9

julia> @chain df1 begin
         @bind_rows(df2, df3, id = "id")
       end
9×4 DataFrame
 Row │ a      b        c        id    
     │ Int64  Int64?   Int64?   Int64 
─────┼────────────────────────────────
   1 │     1        1  missing      1
   2 │     2        2  missing      1
   3 │     3        3  missing      1
   4 │     4        4  missing      2
   5 │     5        5  missing      2
   6 │     6        6  missing      2
   7 │     7  missing        7      3
   8 │     8  missing        8      3
   9 │     9  missing        9      3

source

# TidierData.@count — Macro.

@count(df, exprs..., [wt], [sort])

Count the unique values of one or more variables, with an optional weighting.

@chain df @count(a, b) is roughly equivalent to @chain df @group_by(a, b) @summarize(n = n()). Supply wt to perform weighted counts, switching the summary from n = n() to n = sum(wt). Note that if grouping columns are provided, the result will be an ungrouped data frame, which is slightly different behavior than R's tidyverse.

Arguments

df: A DataFrame or GroupedDataFrame.
exprs...: Column names, separated by commas.
wt: Optional parameter. Used to calculate a sum over the provided wt variable instead of counting the rows.
sort: Defaults to false. Whether the result should be sorted from highest to lowest n.

Examples

julia> df = DataFrame(a = vcat(repeat(["a"], inner = 3),
                           repeat(["b"], inner = 3),
                           repeat(["c"], inner = 1),
                           missing),
                      b = 1:8)
8×2 DataFrame
 Row │ a        b     
     │ String?  Int64 
─────┼────────────────
   1 │ a            1
   2 │ a            2
   3 │ a            3
   4 │ b            4
   5 │ b            5
   6 │ b            6
   7 │ c            7
   8 │ missing      8

julia> @chain df @count()
1×1 DataFrame
 Row │ n     
     │ Int64 
─────┼───────
   1 │     8

julia> @chain df begin
         @count(a)
       end
4×2 DataFrame
 Row │ a        n     
     │ String?  Int64 
─────┼────────────────
   1 │ a            3
   2 │ b            3
   3 │ c            1
   4 │ missing      1

julia> @chain df begin
         @count(a, wt = b)
       end
4×2 DataFrame
 Row │ a        n     
     │ String?  Int64 
─────┼────────────────
   1 │ a            6
   2 │ b           15
   3 │ c            7
   4 │ missing      8

julia> @chain df begin
         @count(a, wt = b, sort = true)
       end
4×2 DataFrame
 Row │ a        n     
     │ String?  Int64 
─────┼────────────────
   1 │ b           15
   2 │ missing      8
   3 │ c            7
   4 │ a            6

source

# TidierData.@distinct — Macro.

@distinct(df, exprs...)

Return distinct rows of a DataFrame.

If no columns or expressions are provided, then unique rows across all columns are returned. Otherwise, unique rows are determined based on the columns or expressions provided, and then all columns are returned.

Arguments

df: A DataFrame.
exprs...: One or more unquoted variable names separated by commas. Variable names can also be used as their positions in the data, like x:y, to select a range of variables.

Examples

julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = repeat(1:5, 2), c = 11:20);

julia> @chain df @distinct()
10×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ a         2     12
   3 │ b         3     13
   4 │ b         4     14
   5 │ c         5     15
   6 │ c         1     16
   7 │ d         2     17
   8 │ d         3     18
   9 │ e         4     19
  10 │ e         5     20

julia> @chain df @distinct(a)
5×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         3     13
   3 │ c         5     15
   4 │ d         2     17
   5 │ e         4     19

julia> @chain df begin
         @distinct(starts_with("a"))
       end
5×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         3     13
   3 │ c         5     15
   4 │ d         2     17
   5 │ e         4     19

julia> @chain df begin
         @distinct(a, b)
       end
10×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ a         2     12
   3 │ b         3     13
   4 │ b         4     14
   5 │ c         5     15
   6 │ c         1     16
   7 │ d         2     17
   8 │ d         3     18
   9 │ e         4     19
  10 │ e         5     20

source

# TidierData.@drop_missing — Macro.

@drop_missing(df, [cols...])

Drop all rows with missing values.

When called without arguments, @drop_missing() drops all rows with missing values in any column. If columns are provided as an optional argument, only missing values from named columns are considered when dropping rows.

Arguments

df: A DataFrame or GroupedDataFrame.
cols...: An optional column, or multiple columns separated by commas or specified using selection helpers.

Examples

julia> df = DataFrame(
              a = [1, 2, missing, 4],
              b = [1, missing, 3, 4]
            )
4×2 DataFrame
 Row │ a        b       
     │ Int64?   Int64?  
─────┼──────────────────
   1 │       1        1
   2 │       2  missing 
   3 │ missing        3
   4 │       4        4

julia> @chain df @drop_missing()
2×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     1      1
   2 │     4      4

julia> @chain df @drop_missing(a)
3×2 DataFrame
 Row │ a      b       
     │ Int64  Int64?  
─────┼────────────────
   1 │     1        1
   2 │     2  missing 
   3 │     4        4

julia> @chain df @drop_missing(a, b)
2×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     1      1
   2 │     4      4

julia> @chain df @drop_missing(starts_with("a"))
3×2 DataFrame
 Row │ a      b       
     │ Int64  Int64?  
─────┼────────────────
   1 │     1        1
   2 │     2  missing 
   3 │     4        4

source

# TidierData.@fill_missing — Macro.

@fill_missing(df, [columns...], direction)

Fill missing values in a DataFrame df using the specified method.

Arguments

df: The DataFrame or GroupedDataFrame in which you want to fill missing values.
columns: (Optional) The columns for which missing values need to be filled, separated by commas. If not provided, the operation is applied to all columns.
direction: A string containing the method to use for filling missing values. Options include: "down" (last observation carried forward) or "up" (next observation carried backward).

Examples

julia> df = DataFrame(
          dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6],
          dt2 = [0.3, 2, missing, 3, missing, 5, 6,missing],
          dt3 = [missing, 0.2, missing, missing, 1, missing, 5, 6],
          dt4 = [0.3, missing, missing, 3, missing, 5, 6, missing],
          dt5 = ['a', 'b', 'a', 'b', 'a', 'a', 'a', 'b']);

julia> @fill_missing(df, dt2, dt4, "down")
8×5 DataFrame
 Row │ dt1        dt2       dt3        dt4       dt5  
     │ Float64?   Float64?  Float64?   Float64?  Char 
─────┼────────────────────────────────────────────────
   1 │ missing         0.3  missing         0.3  a
   2 │       0.2       2.0        0.2       0.3  b
   3 │ missing         2.0  missing         0.3  a
   4 │ missing         3.0  missing         3.0  b
   5 │       1.0       3.0        1.0       3.0  a
   6 │ missing         5.0  missing         5.0  a
   7 │       5.0       6.0        5.0       6.0  a
   8 │       6.0       6.0        6.0       6.0  b

julia> @chain df begin
         @fill_missing("up")
       end
8×5 DataFrame
 Row │ dt1       dt2        dt3       dt4        dt5  
     │ Float64?  Float64?   Float64?  Float64?   Char 
─────┼────────────────────────────────────────────────
   1 │      0.2        0.3       0.2        0.3  a
   2 │      0.2        2.0       0.2        3.0  b
   3 │      1.0        3.0       1.0        3.0  a
   4 │      1.0        3.0       1.0        3.0  b
   5 │      1.0        5.0       1.0        5.0  a
   6 │      5.0        5.0       5.0        5.0  a
   7 │      5.0        6.0       5.0        6.0  a
   8 │      6.0  missing         6.0  missing    b 

julia> @chain df begin
         @group_by(dt5)
         @fill_missing(dt1, "up")
       end
GroupedDataFrame with 2 groups based on key: dt5
First Group (5 rows): dt5 = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)
 Row │ dt1       dt2        dt3        dt4        dt5  
     │ Float64?  Float64?   Float64?   Float64?   Char 
─────┼─────────────────────────────────────────────────
   1 │      1.0        0.3  missing          0.3  a
   2 │      1.0  missing    missing    missing    a
   3 │      1.0  missing          1.0  missing    a
   4 │      5.0        5.0  missing          5.0  a
   5 │      5.0        6.0        5.0        6.0  a
⋮
Last Group (3 rows): dt5 = 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)
 Row │ dt1       dt2        dt3        dt4        dt5  
     │ Float64?  Float64?   Float64?   Float64?   Char 
─────┼─────────────────────────────────────────────────
   1 │      0.2        2.0        0.2  missing    b
   2 │      6.0        3.0  missing          3.0  b
   3 │      6.0  missing          6.0  missing    b

source

# TidierData.@filter — Macro.

@filter(df, exprs...)

Subset a DataFrame and return a copy of the DataFrame where specified conditions are satisfied.

Arguments

df: A DataFrame.
exprs...: transformation(s) that produce vectors containing true or false.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @filter(b >= mean(b))
       end
3×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ c         3     13
   2 │ d         4     14
   3 │ e         5     15

julia> @chain df begin
         @filter(b >= 3 && c >= 14)
       end
2×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ d         4     14
   2 │ e         5     15

julia> @chain df begin
         @filter(b in (1, 3))
       end
2×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ c         3     13

source

# TidierData.@full_join — Macro.

@full_join(df1, df2, [by])

Perform a full join on df1 and df2 with an optional by.

Arguments

df1: A DataFrame.
df2: A DataFrame.
by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

Examples

julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @full_join(df1, df2)
3×3 DataFrame
 Row │ a       b        c       
     │ String  Int64?   Int64?  
─────┼──────────────────────────
   1 │ a             1        3
   2 │ b             2  missing 
   3 │ c       missing        4

julia> @full_join(df1, df2, a)
3×3 DataFrame
 Row │ a       b        c       
     │ String  Int64?   Int64?  
─────┼──────────────────────────
   1 │ a             1        3
   2 │ b             2  missing 
   3 │ c       missing        4

julia> @full_join(df1, df2, a = a)
3×3 DataFrame
 Row │ a       b        c       
     │ String  Int64?   Int64?  
─────┼──────────────────────────
   1 │ a             1        3
   2 │ b             2  missing 
   3 │ c       missing        4

julia> @full_join(df1, df2, "a")
3×3 DataFrame
 Row │ a       b        c       
     │ String  Int64?   Int64?  
─────┼──────────────────────────
   1 │ a             1        3
   2 │ b             2  missing 
   3 │ c       missing        4

julia> @full_join(df1, df2, "a" = "a")
3×3 DataFrame
 Row │ a       b        c       
     │ String  Int64?   Int64?  
─────┼──────────────────────────
   1 │ a             1        3
   2 │ b             2  missing 
   3 │ c       missing        4

source

# TidierData.@glimpse — Macro.

@glimpse(df, width = 80)

Preview a DataFrame (or GroupedDataFrame).

The @glimpse macro is used to preview a DataFrame or GroupedDataFrame. Each column is printed on a separate row, along with its data type and first few elements, with the output truncated based on the width.

Arguments

df: A DataFrame or GroupedDataFrame.
width: The width of the output, measured in the number of characters. Defaults to 80.

Examples

julia> df = DataFrame(
               a = 1:100, 
               b = 1:100, 
               c = repeat(["a"], 100)
               );

julia> @chain df @glimpse
Rows: 100
Columns: 3
.a             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
.b             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
.c             String         a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,

julia> @chain df begin
       @group_by(a)
       @glimpse()
       end
Rows: 100
Columns: 3
Groups: a [100]
.a             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
.b             Int64          1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
.c             String         a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,

source

# TidierData.@group_by — Macro.

@group_by(df, exprs...)

Return a GroupedDataFrame where operations are performed by groups specified by unique sets of cols.

Arguments

df: A DataFrame.
exprs...: DataFrame columns to group by or tidy expressions. Can be a single tidy expression or multiple expressions separated by commas.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @group_by(a)
         @summarize(b = mean(b))
       end
5×2 DataFrame
 Row │ a     b       
     │ Char  Float64 
─────┼───────────────
   1 │ a         1.0
   2 │ b         2.0
   3 │ c         3.0
   4 │ d         4.0
   5 │ e         5.0  

julia> @chain df begin
         @group_by(d = uppercase(a))
         @summarize(b = mean(b))
       end
5×2 DataFrame
 Row │ d     b       
     │ Char  Float64 
─────┼───────────────
   1 │ A         1.0
   2 │ B         2.0
   3 │ C         3.0
   4 │ D         4.0
   5 │ E         5.0

julia> @chain df begin
         @group_by(-(b, c)) # same as `a`
         @summarize(b = mean(b))
       end
5×2 DataFrame
 Row │ a     b       
     │ Char  Float64 
─────┼───────────────
   1 │ a         1.0
   2 │ b         2.0
   3 │ c         3.0
   4 │ d         4.0
   5 │ e         5.0

julia> @chain df begin
         @group_by(!(b, c)) # same as `a`
         @summarize(b = mean(b))
       end
5×2 DataFrame
 Row │ a     b       
     │ Char  Float64 
─────┼───────────────
   1 │ a         1.0
   2 │ b         2.0
   3 │ c         3.0
   4 │ d         4.0
   5 │ e         5.0

source

# TidierData.@head — Macro.

   @head(df, value)

Shows the first n rows of the data frame or of each group in a grouped data frame.

Arguments

df: The data frame.
value: number of rows to be returned. Defaults to 6 if left blank.

Examples

julia> df = DataFrame(a = vcat(repeat(["a"], inner = 4),
                                  repeat(["b"], inner = 4)),
                             b = 1:8)
8×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1
   2 │ a           2
   3 │ a           3
   4 │ a           4
   5 │ b           5
   6 │ b           6
   7 │ b           7
   8 │ b           8

julia> @head(df, 3)
3×2 DataFrame
 Row │ a        b     
     │ String?  Int64 
─────┼────────────────
   1 │ a            1
   2 │ a            2
   3 │ a            3

julia> @head(df)
6×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1
   2 │ a           2
   3 │ a           3
   4 │ a           4
   5 │ b           5
   6 │ b           6

julia> @chain df begin
         @group_by a
         @head 2
       end
GroupedDataFrame with 2 groups based on key: a
First Group (2 rows): a = "a"
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1
   2 │ a           2
⋮
Last Group (2 rows): a = "b"
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ b           5
   2 │ b           6

source

# TidierData.@inner_join — Macro.

@inner_join(df1, df2, [by])

Perform an inner join on df1 and df2 with an optional by.

Arguments

df1: A DataFrame.
df2: A DataFrame.
by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

Examples

julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @inner_join(df1, df2)
1×3 DataFrame
 Row │ a       b      c     
     │ String  Int64  Int64 
─────┼──────────────────────
   1 │ a           1      3

julia> @inner_join(df1, df2, a)
1×3 DataFrame
 Row │ a       b      c     
     │ String  Int64  Int64 
─────┼──────────────────────
   1 │ a           1      3

julia> @inner_join(df1, df2, a = a)
1×3 DataFrame
 Row │ a       b      c     
     │ String  Int64  Int64 
─────┼──────────────────────
   1 │ a           1      3

julia> @inner_join(df1, df2, "a")
1×3 DataFrame
 Row │ a       b      c     
     │ String  Int64  Int64 
─────┼──────────────────────
   1 │ a           1      3

julia> @inner_join(df1, df2, "a" = "a")
1×3 DataFrame
 Row │ a       b      c     
     │ String  Int64  Int64 
─────┼──────────────────────
   1 │ a           1      3

source

# TidierData.@left_join — Macro.

@left_join(df1, df2, [by])

Perform a left join on df1 and df2 with an optional by.

Arguments

df1: A DataFrame.
df2: A DataFrame.
by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

Examples

julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @left_join(df1, df2)
2×3 DataFrame
 Row │ a       b      c       
     │ String  Int64  Int64?  
─────┼────────────────────────
   1 │ a           1        3
   2 │ b           2  missing 

julia> @left_join(df1, df2, a)
2×3 DataFrame
 Row │ a       b      c       
     │ String  Int64  Int64?  
─────┼────────────────────────
   1 │ a           1        3
   2 │ b           2  missing

julia> @left_join(df1, df2, a = a)
2×3 DataFrame
 Row │ a       b      c       
     │ String  Int64  Int64?  
─────┼────────────────────────
   1 │ a           1        3
   2 │ b           2  missing

julia> @left_join(df1, df2, "a")
2×3 DataFrame
 Row │ a       b      c       
     │ String  Int64  Int64?  
─────┼────────────────────────
   1 │ a           1        3
   2 │ b           2  missing

julia> @left_join(df1, df2, "a" = "a")
2×3 DataFrame
 Row │ a       b      c       
     │ String  Int64  Int64?  
─────┼────────────────────────
   1 │ a           1        3
   2 │ b           2  missing

source

# TidierData.@mutate — Macro.

@mutate(df, exprs...)

Create new columns as functions of existing columns. The results have the same number of rows as df.

Arguments

df: A DataFrame.
exprs...: add new columns or replace values of existing columns using new_variable = values syntax.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @mutate(d = b + c,
                 b_minus_mean_b = b - mean(b))
       end
5×5 DataFrame
 Row │ a     b      c      d      b_minus_mean_b 
     │ Char  Int64  Int64  Int64  Float64        
─────┼───────────────────────────────────────────
   1 │ a         1     11     12            -2.0
   2 │ b         2     12     14            -1.0
   3 │ c         3     13     16             0.0
   4 │ d         4     14     18             1.0
   5 │ e         5     15     20             2.0

julia> @chain df begin
         @mutate begin
           d = b + c
           b_minus_mean_b = b - mean(b)
         end
       end
5×5 DataFrame
 Row │ a     b      c      d      b_minus_mean_b 
     │ Char  Int64  Int64  Int64  Float64        
─────┼───────────────────────────────────────────
   1 │ a         1     11     12            -2.0
   2 │ b         2     12     14            -1.0
   3 │ c         3     13     16             0.0
   4 │ d         4     14     18             1.0
   5 │ e         5     15     20             2.0

julia> @chain df begin
         @mutate(d = b in (1,3))
       end
5×4 DataFrame
 Row │ a     b      c      d     
     │ Char  Int64  Int64  Bool  
─────┼───────────────────────────
   1 │ a         1     11   true
   2 │ b         2     12  false
   3 │ c         3     13   true
   4 │ d         4     14  false
   5 │ e         5     15  false

julia> @chain df begin
         @mutate(across((b, c), mean))
       end
5×5 DataFrame
 Row │ a     b      c      b_mean   c_mean  
     │ Char  Int64  Int64  Float64  Float64 
─────┼──────────────────────────────────────
   1 │ a         1     11      3.0     13.0
   2 │ b         2     12      3.0     13.0
   3 │ c         3     13      3.0     13.0
   4 │ d         4     14      3.0     13.0
   5 │ e         5     15      3.0     13.0

julia> @chain df begin
         @summarize(across(contains("b"), mean))
       end
1×1 DataFrame
 Row │ b_mean  
     │ Float64 
─────┼─────────
   1 │     3.0

julia> @chain df begin
         @summarize(across(-contains("a"), mean))
       end
1×2 DataFrame
 Row │ b_mean   c_mean  
     │ Float64  Float64 
─────┼──────────────────
   1 │     3.0     13.0

julia> @chain df begin
         @mutate(across(where(is_number), minimum))
       end
5×5 DataFrame
 Row │ a     b      c      b_minimum  c_minimum 
     │ Char  Int64  Int64  Int64      Int64     
─────┼──────────────────────────────────────────
   1 │ a         1     11          1         11
   2 │ b         2     12          1         11
   3 │ c         3     13          1         11
   4 │ d         4     14          1         11
   5 │ e         5     15          1         11

source

# TidierData.@nest — Macro.

@nest(df, new_column = nesting_columns)

Multiple columns are nested into one or more new columns in a DataFrame.

Arguments

df: A DataFrame
new_column: New column name
nesting_columns: Columns to be nested into the new_column

Examples

julia> df = DataFrame(a = repeat('a':'e', inner = 3),
                      b = 1:15,
                      c_1 = 16:30,
                      c_2 = 31:45);

julia> @nest(df, data = b:c_2)
5×2 DataFrame
 Row │ a     data          
     │ Char  DataFrame     
─────┼─────────────────────
   1 │ a     3×3 DataFrame 
   2 │ b     3×3 DataFrame 
   3 │ c     3×3 DataFrame 
   4 │ d     3×3 DataFrame 
   5 │ e     3×3 DataFrame 

julia> @nest(df, data_1 = b, data_2 = starts_with("c"))
5×3 DataFrame
 Row │ a     data_1         data_2        
     │ Char  DataFrame      DataFrame     
─────┼────────────────────────────────────
   1 │ a     3×1 DataFrame  3×2 DataFrame 
   2 │ b     3×1 DataFrame  3×2 DataFrame 
   3 │ c     3×1 DataFrame  3×2 DataFrame 
   4 │ d     3×1 DataFrame  3×2 DataFrame 
   5 │ e     3×1 DataFrame  3×2 DataFrame 

julia> @chain df begin
         @nest(data = b:c_2)
         @unnest_longer(data)
       end
15×2 DataFrame
 Row │ a     data                         
     │ Char  NamedTup…                    
─────┼────────────────────────────────────
   1 │ a     (b = 1, c_1 = 16, c_2 = 31)
   2 │ a     (b = 2, c_1 = 17, c_2 = 32)
   3 │ a     (b = 3, c_1 = 18, c_2 = 33)
   4 │ b     (b = 4, c_1 = 19, c_2 = 34)
   5 │ b     (b = 5, c_1 = 20, c_2 = 35)
   6 │ b     (b = 6, c_1 = 21, c_2 = 36)
   7 │ c     (b = 7, c_1 = 22, c_2 = 37)
   8 │ c     (b = 8, c_1 = 23, c_2 = 38)
   9 │ c     (b = 9, c_1 = 24, c_2 = 39)
  10 │ d     (b = 10, c_1 = 25, c_2 = 40)
  11 │ d     (b = 11, c_1 = 26, c_2 = 41)
  12 │ d     (b = 12, c_1 = 27, c_2 = 42)
  13 │ e     (b = 13, c_1 = 28, c_2 = 43)
  14 │ e     (b = 14, c_1 = 29, c_2 = 44)
  15 │ e     (b = 15, c_1 = 30, c_2 = 45)

julia> @chain df begin
         @nest(data = b:c_2)
         @unnest_wider(data)
       end
5×4 DataFrame
 Row │ a     b             c_1           c_2          
     │ Char  Any           Any           Any          
─────┼────────────────────────────────────────────────
   1 │ a     [1, 2, 3]     [16, 17, 18]  [31, 32, 33]
   2 │ b     [4, 5, 6]     [19, 20, 21]  [34, 35, 36]
   3 │ c     [7, 8, 9]     [22, 23, 24]  [37, 38, 39]
   4 │ d     [10, 11, 12]  [25, 26, 27]  [40, 41, 42]
   5 │ e     [13, 14, 15]  [28, 29, 30]  [43, 44, 45]

julia> @chain df begin
         @nest(data = -a)
         @unnest_wider(data) # wider first
         @unnest_longer(-a)  # then longer
       end
15×4 DataFrame
 Row │ a     b      c_1    c_2   
     │ Char  Int64  Int64  Int64 
─────┼───────────────────────────
   1 │ a         1     16     31
   2 │ a         2     17     32
   3 │ a         3     18     33
   4 │ b         4     19     34
   5 │ b         5     20     35
   6 │ b         6     21     36
   7 │ c         7     22     37
   8 │ c         8     23     38
   9 │ c         9     24     39
  10 │ d        10     25     40
  11 │ d        11     26     41
  12 │ d        12     27     42
  13 │ e        13     28     43
  14 │ e        14     29     44
  15 │ e        15     30     45

julia> @chain df begin
         @nest(data = -a)
         @unnest_longer(data) # longer first
         @unnest_wider(-a)    # then wider
       end
15×4 DataFrame
 Row │ a     b      c_2    c_1   
     │ Char  Int64  Int64  Int64 
─────┼───────────────────────────
   1 │ a         1     31     16
   2 │ a         2     32     17
   3 │ a         3     33     18
   4 │ b         4     34     19
   5 │ b         5     35     20
   6 │ b         6     36     21
   7 │ c         7     37     22
   8 │ c         8     38     23
   9 │ c         9     39     24
  10 │ d        10     40     25
  11 │ d        11     41     26
  12 │ d        12     42     27
  13 │ e        13     43     28
  14 │ e        14     44     29
  15 │ e        15     45     30

source

# TidierData.@pivot_longer — Macro.

@pivot_longer(df, cols, [names_to], [values_to])

Reshapes the DataFrame to make it longer, increasing the number of rows and reducing the number of columns.

Arguments

df: A DataFrame.
cols: Columns to pivot into longer format. Multiple columns can be selected but providing tuples of columns is not yet supported.
names_to: Optional, defaults to variable. The name of the newly created column whose values will contain the input DataFrame's column names.
values_to: Optional, defaults to value. The name of the newly created column containing the input DataFrame's cell values.

Examples

julia> df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4]);

julia> @pivot_longer(df_wide, A:B)
4×3 DataFrame
 Row │ id     variable  value 
     │ Int64  String    Int64
─────┼────────────────────────
   1 │     1  A             1
   2 │     2  A             3
   3 │     1  B             2
   4 │     2  B             4

julia> @pivot_longer(df_wide, -id)
4×3 DataFrame
 Row │ id     variable  value 
     │ Int64  String    Int64
─────┼────────────────────────
   1 │     1  A             1
   2 │     2  A             3
   3 │     1  B             2
   4 │     2  B             4

julia> @pivot_longer(df_wide, A:B, names_to = "letter", values_to = "number")
4×3 DataFrame
 Row │ id     letter  number 
     │ Int64  String  Int64
─────┼───────────────────────
   1 │     1  A            1
   2 │     2  A            3
   3 │     1  B            2
   4 │     2  B            4

julia> @pivot_longer(df_wide, A:B, names_to = letter, values_to = number)
4×3 DataFrame
 Row │ id     letter  number 
     │ Int64  String  Int64
─────┼───────────────────────
   1 │     1  A            1
   2 │     2  A            3
   3 │     1  B            2
   4 │     2  B            4

julia> @pivot_longer(df_wide, A:B, names_to = "letter")
4×3 DataFrame
 Row │ id     letter  value 
     │ Int64  String  Int64
─────┼──────────────────────
   1 │     1  A           1
   2 │     2  A           3
   3 │     1  B           2
   4 │     2  B           4

source

# TidierData.@pivot_wider — Macro.

@pivot_wider(df, names_from, values_from[, values_fill])

Reshapes the DataFrame to make it wider, increasing the number of columns and reducing the number of rows.

Arguments

df: A DataFrame.
names_from: The name of the column to get the name of the output columns from.
values_from: The name of the column to get the cell values from.
values_fill: The value to replace a missing name/value combination (default is missing)

Examples

julia> df_long = DataFrame(id = [1, 1, 2, 2],
                           variable = ["A", "B", "A", "B"],
                           value = [1, 2, 3, 4]);

julia> df_long_missing = DataFrame(id = [1, 1, 2],
                           variable = ["A", "B", "B"],
                           value = [1, 2, 4]);

julia> @pivot_wider(df_long, names_from = variable, values_from = value)
2×3 DataFrame
 Row │ id     A       B      
     │ Int64  Int64?  Int64?
─────┼───────────────────────
   1 │     1       1       2
   2 │     2       3       4

julia> @pivot_wider(df_long, names_from = "variable", values_from = "value")
2×3 DataFrame
 Row │ id     A       B      
     │ Int64  Int64?  Int64?
─────┼───────────────────────
   1 │     1       1       2
   2 │     2       3       4

julia> @pivot_wider(df_long_missing, names_from = variable, values_from = value, values_fill = 0)
2×3 DataFrame
 Row │ id     A      B     
     │ Int64  Int64  Int64
─────┼─────────────────────
   1 │     1      1      2
   2 │     2      0      4

source

# TidierData.@pull — Macro.

@pull(df, column)

Pull (or extract) a column as a vector.

Arguments

df: A DataFrame.
column: A single column, referred to either by its name or number.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df @pull(a)
5-element Vector{Char}:
 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)
 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)
 'c': ASCII/Unicode U+0063 (category Ll: Letter, lowercase)
 'd': ASCII/Unicode U+0064 (category Ll: Letter, lowercase)
 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)

julia> @chain df @pull(2)
5-element Vector{Int64}:
 1
 2
 3
 4
 5

source

# TidierData.@relocate — Macro.

@relocate(df, columns, before = nothing, after = nothing)

Rearranges the columns of a data frame. This function allows for moving specified columns to a new position within the data frame, either before or after a given target column. The columns, before, and after arguments all accept tidy selection functions. Only one of before or after should be specified. If neither is specified, the selected columns will be moved to the beginning of the data frame.

Arguments

df: The data frame.
columns: Column or columns to be moved.
before: (Optional) Column or columns before which the specified columns will be moved. If not provided or nothing, this argument is ignored.
after: (Optional) Column or columns after which the specified columns will be moved. If not provided or nothing, this argument is ignored.

Examples

julia> df = DataFrame(A = 1:5, B = 6:10, C = ["A", "b", "C", "D", "E"], D = ['A', 'B','A', 'B','C'],
                      E = 1:5, F = ["A", "b", "C", "D", "E"]);

julia> @relocate(df, where(is_string), before = where(is_integer))
5×6 DataFrame
 Row │ C       F       A      B      E      D    
     │ String  String  Int64  Int64  Int64  Char 
─────┼───────────────────────────────────────────
   1 │ A       A           1      6      1  A
   2 │ b       b           2      7      2  B
   3 │ C       C           3      8      3  A
   4 │ D       D           4      9      4  B
   5 │ E       E           5     10      5  C


julia> @relocate(df, B, C, D, after = E)
5×6 DataFrame
 Row │ A      E      B      C       D     F      
     │ Int64  Int64  Int64  String  Char  String 
─────┼───────────────────────────────────────────
   1 │     1      1      6  A       A     A
   2 │     2      2      7  b       B     b
   3 │     3      3      8  C       A     C
   4 │     4      4      9  D       B     D
   5 │     5      5     10  E       C     E

julia> @relocate(df, B, C, D, after = starts_with("E"))
5×6 DataFrame
 Row │ A      E      B      C       D     F      
     │ Int64  Int64  Int64  String  Char  String 
─────┼───────────────────────────────────────────
   1 │     1      1      6  A       A     A
   2 │     2      2      7  b       B     b
   3 │     3      3      8  C       A     C
   4 │     4      4      9  D       B     D
   5 │     5      5     10  E       C     E

julia> @relocate(df, B:C) # bring columns to the front
5×6 DataFrame
 Row │ B      C       A      D     E      F      
     │ Int64  String  Int64  Char  Int64  String 
─────┼───────────────────────────────────────────
   1 │     6  A           1  A         1  A
   2 │     7  b           2  B         2  b
   3 │     8  C           3  A         3  C
   4 │     9  D           4  B         4  D
   5 │    10  E           5  C         5  E

source

# TidierData.@rename — Macro.

@rename(df, exprs...)

Change the names of individual columns in a DataFrame. Users can also use @select() to rename and select columns.

Arguments

df: A DataFrame.
exprs...: Use new_name = old_name syntax to rename selected columns.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @rename(d = b, e = c)
       end
5×3 DataFrame
 Row │ a     d      e     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         2     12
   3 │ c         3     13
   4 │ d         4     14
   5 │ e         5     15

source

# TidierData.@rename_with — Macro.

 @rename_with(df, fn, exprs...)

Renames the chosen column names using a function.

Arguments

df: a DataFrame
fn: desired function to apply to the column names (such as str_remove_all from TidierStrings)
exprs: One or more unquoted variable names separated by commas. Variable names can also be used as their positions in the data, like x:y, to select a range of variables. Variable names can also be chosen with starts_with. Defaults to all columns if empty.

Examples

julia> function str_remove_all(column, pattern::String)
         if ismissing(column)
             return column
         end
         patterns = split(pattern, '|')
         for p in patterns
             column = replace(column, strip(p) => "")
         end
         return column
       end;

julia> df = DataFrame(
              term_a = ["apple", "banana", "cherry"],
              document_a = ["doc_1", "doc2", "doc3"],
              _n_ = [1, 2, 3]
            ); 

julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)
3×3 DataFrame
 Row │ term_a  document  _n_   
     │ String  String    Int64 
─────┼─────────────────────────
   1 │ apple   doc_1         1
   2 │ banana  doc2          2
   3 │ cherry  doc3          3

source

# TidierData.@right_join — Macro.

@right_join(df1, df2, [by])

Perform a right join on df1 and df2 with an optional by.

Arguments

df1: A DataFrame.
df2: A DataFrame.
by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

Examples

julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @right_join(df1, df2)
2×3 DataFrame
 Row │ a       b        c     
     │ String  Int64?   Int64 
─────┼────────────────────────
   1 │ a             1      3
   2 │ c       missing      4

julia> @right_join(df1, df2, a)
2×3 DataFrame
 Row │ a       b        c     
     │ String  Int64?   Int64 
─────┼────────────────────────
   1 │ a             1      3
   2 │ c       missing      4

julia> @right_join(df1, df2, a = a)
2×3 DataFrame
 Row │ a       b        c     
     │ String  Int64?   Int64 
─────┼────────────────────────
   1 │ a             1      3
   2 │ c       missing      4

julia> @right_join(df1, df2, "a")
2×3 DataFrame
 Row │ a       b        c     
     │ String  Int64?   Int64 
─────┼────────────────────────
   1 │ a             1      3
   2 │ c       missing      4

julia> @right_join(df1, df2, "a" = "a")
2×3 DataFrame
 Row │ a       b        c     
     │ String  Int64?   Int64 
─────┼────────────────────────
   1 │ a             1      3
   2 │ c       missing      4

source

# TidierData.@select — Macro.

@select(df, exprs...)

Select variables in a DataFrame.

Arguments

df: A DataFrame.
exprs...: One or more unquoted variable names separated by commas. Variable names can also be used as if they were positions in the data, so an expression like x:y selects a range of variables.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df @select(a, b, c)
5×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         2     12
   3 │ c         3     13
   4 │ d         4     14
   5 │ e         5     15

julia> @chain df @select(a:b)
5×2 DataFrame
 Row │ a     b     
     │ Char  Int64 
─────┼─────────────
   1 │ a         1
   2 │ b         2
   3 │ c         3
   4 │ d         4
   5 │ e         5

julia> @chain df @select(1:2)
5×2 DataFrame
 Row │ a     b     
     │ Char  Int64 
─────┼─────────────
   1 │ a         1
   2 │ b         2
   3 │ c         3
   4 │ d         4
   5 │ e         5

julia> @chain df @select(-(a:b))
5×1 DataFrame
 Row │ c     
     │ Int64 
─────┼───────
   1 │    11
   2 │    12
   3 │    13
   4 │    14
   5 │    15

julia> @chain df @select(!(a:b))
5×1 DataFrame
 Row │ c     
     │ Int64 
─────┼───────
   1 │    11
   2 │    12
   3 │    13
   4 │    14
   5 │    15

julia> @chain df @select(-(a, b))
5×1 DataFrame
 Row │ c     
     │ Int64 
─────┼───────
   1 │    11
   2 │    12
   3 │    13
   4 │    14
   5 │    15

julia> @chain df @select(!(a, b))
5×1 DataFrame
 Row │ c     
     │ Int64 
─────┼───────
   1 │    11
   2 │    12
   3 │    13
   4 │    14
   5 │    15

julia> @chain df begin
         @select(contains("b"), starts_with("c"))
       end
5×2 DataFrame
 Row │ b      c     
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

julia> @chain df @select(-(1:2))
5×1 DataFrame
 Row │ c     
     │ Int64 
─────┼───────
   1 │    11
   2 │    12
   3 │    13
   4 │    14
   5 │    15

julia> @chain df @select(!(1:2))
5×1 DataFrame
 Row │ c     
     │ Int64 
─────┼───────
   1 │    11
   2 │    12
   3 │    13
   4 │    14
   5 │    15

julia> @chain df @select(-c)
5×2 DataFrame
 Row │ a     b     
     │ Char  Int64 
─────┼─────────────
   1 │ a         1
   2 │ b         2
   3 │ c         3
   4 │ d         4
   5 │ e         5

julia> @chain df begin
         @select(-contains("a"))
       end
5×2 DataFrame
 Row │ b      c     
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

julia> @chain df begin
         @select(!contains("a"))
       end
5×2 DataFrame
 Row │ b      c     
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

julia> @chain df begin
         @select(where(is_number))
       end
5×2 DataFrame
 Row │ b      c     
     │ Int64  Int64 
─────┼──────────────
   1 │     1     11
   2 │     2     12
   3 │     3     13
   4 │     4     14
   5 │     5     15

source

# TidierData.@semi_join — Macro.

@semi_join(df1, df2, [by])

Perform a semi-join on df1 and df2 with an optional by.

Arguments

df1: A DataFrame.
df2: A DataFrame.
by: An optional column or tuple of columns. by supports interpolation of individual columns. If by is not supplied, then it will be inferred from shared names of columns between df1 and df2.

Examples

julia> df1 = DataFrame(a = ["a", "b"], b = 1:2);

julia> df2 = DataFrame(a = ["a", "c"], c = 3:4);

julia> @semi_join(df1, df2)
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1

julia> @semi_join(df1, df2, a)
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1

julia> @semi_join(df1, df2, a = a)
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1

julia> @semi_join(df1, df2, "a")
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1

julia> @semi_join(df1, df2, "a" = "a")
1×2 DataFrame
 Row │ a       b     
     │ String  Int64 
─────┼───────────────
   1 │ a           1

source

# TidierData.@separate — Macro.

@separate(df, from, into, sep, extra = "merge")

Separate a string column into multiple new columns based on a specified delimiter.

Arguments

df: A DataFrame
from: Column that will be split
into: New column names, supports [] or ()
sep: the string or character on which to split
extra: "merge", "warn", or "drop". If not enough columns are provided, extra determines whether additional entries will be merged into the final one or dropped. "warn" generates a warning message for dropped values.

Examples

julia> df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);

julia> @separate(df, a, [b, c, d], "-")
3×3 DataFrame
 Row │ b          c          d          
     │ SubStrin…  SubStrin…  SubStrin…? 
─────┼──────────────────────────────────
   1 │ 1          1          missing    
   2 │ 2          2          missing    
   3 │ 3          3          3

julia> @chain df begin
         @separate(a, (b, c, d), "-")
       end
3×3 DataFrame
 Row │ b          c          d          
     │ SubStrin…  SubStrin…  SubStrin…? 
─────┼──────────────────────────────────
   1 │ 1          1          missing    
   2 │ 2          2          missing    
   3 │ 3          3          3

julia> @separate(df, a, (b, c), "-")
3×2 DataFrame
 Row │ b          c      
     │ SubStrin…  String 
─────┼───────────────────
   1 │ 1          1
   2 │ 2          2
   3 │ 3          3-3

julia> @chain df begin
         @separate(a, (b, c), "-", extra = "drop")
       end
3×2 DataFrame
 Row │ b          c         
     │ SubStrin…  SubStrin… 
─────┼──────────────────────
   1 │ 1          1
   2 │ 2          2
   3 │ 3          3

source

# TidierData.@separate_rows — Macro.

@separate_rows(df, columns..., sep)

Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.

Arguments

df: A DataFrame
columns: A column or multiple columns to be split. Can be a mix of integers and column names.
sep: The string or character or regular expression used to split the column values.

Examples

julia> df = DataFrame(a = 1:3,
                      b = ["a", "aa;bb;cc", "dd;ee"],
                      c = ["1", "2;3;4", "5;6"],
                      d = ["7", "8;9;10", "11;12"])
3×4 DataFrame
 Row │ a      b         c       d      
     │ Int64  String    String  String 
─────┼─────────────────────────────────
   1 │     1  a         1       7
   2 │     2  aa;bb;cc  2;3;4   8;9;10
   3 │     3  dd;ee     5;6     11;12

julia> @separate_rows(df, 2, 4, ";")
6×4 DataFrame
 Row │ a      b          c       d         
     │ Int64  SubStrin…  String  SubStrin… 
─────┼─────────────────────────────────────
   1 │     1  a          1       7
   2 │     2  aa         2;3;4   8
   3 │     2  bb         2;3;4   9
   4 │     2  cc         2;3;4   10
   5 │     3  dd         5;6     11
   6 │     3  ee         5;6     12

julia> @separate_rows(df, b:d, ";")
6×4 DataFrame
 Row │ a      b          c          d         
     │ Int64  SubStrin…  SubStrin…  SubStrin… 
─────┼────────────────────────────────────────
   1 │     1  a          1          7
   2 │     2  aa         2          8
   3 │     2  bb         3          9
   4 │     2  cc         4          10
   5 │     3  dd         5          11
   6 │     3  ee         6          12

source

# TidierData.@slice — Macro.

@slice(df, exprs...)

Select, remove or duplicate rows by indexing their integer positions.

Arguments

df: A DataFrame.
exprs...: Integer row values. Use positive values to keep the rows, or negative values to drop. Values provided must be either all positive or all negative, and they must be within the range of the DataFrame's row numbers.

Examples

julia> df = DataFrame(a = repeat('a':'c', inner = 3), b = 1:9, c = 11:19);

julia> @chain df @slice(1:5)
5×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ a         2     12
   3 │ a         3     13
   4 │ b         4     14
   5 │ b         5     15

julia> @chain df @slice(-(1:2))
7×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         3     13
   2 │ b         4     14
   3 │ b         5     15
   4 │ b         6     16
   5 │ c         7     17
   6 │ c         8     18
   7 │ c         9     19

julia> @chain df begin
         @group_by(a)
         @slice(1)
         @ungroup
       end
3×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         4     14
   3 │ c         7     17

julia> @chain df begin
         @group_by(a)
         @slice(n())
         @ungroup
       end
3×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         3     13
   2 │ b         6     16
   3 │ c         9     19

julia> @chain df begin
         @group_by(a)
         @slice(-n())
         @ungroup
       end
6×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ a         2     12
   3 │ b         4     14
   4 │ b         5     15
   5 │ c         7     17
   6 │ c         8     18

julia> @chain df begin
         @group_by(a)
         @slice(-(2:n()))
         @ungroup
       end
3×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         4     14
   3 │ c         7     17

source

# TidierData.@slice_head — Macro.

@slice_head(df; n, prop)

Retrieve rows from the beginning of a DataFrame or GroupedDataFrame.

Arguments

df: The source data frame or grouped data frame from which to slice rows.
prop: The proportion of rows to slice.
n: An optional integer argument to specify the number of rows at the beginning of the dataframe to retrieve. Defaults to 1.

Examples

julia> df = DataFrame(
           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);

julia> @chain df begin
         @slice_head(n = 3)
       end 
3×3 DataFrame
 Row │ a          b          c        
     │ Float64?   Float64?   Float64? 
─────┼────────────────────────────────
   1 │ missing          0.3       0.2
   2 │       0.2        2.0       0.2
   3 │ missing    missing         0.2

julia> @chain df begin
         @slice_head(prop = 0.25)
       end 
2×3 DataFrame
 Row │ a          b         c        
     │ Float64?   Float64?  Float64? 
─────┼───────────────────────────────
   1 │ missing         0.3       0.2
   2 │       0.2       2.0       0.2

source

# TidierData.@slice_max — Macro.

@slice_max(df, column; with_ties = true, n, prop, missing_rm = true)

Retrieve rows with the maximum value(s) from the specified column of a DataFrame or GroupedDataFrame.

Arguments

df: The source data frame or grouped data frame from which to slice rows.
column: The column for which to slice the maximum values.
with_ties: Whether or not all ties will be shown, defaults to true. When false it will only show the first row.
prop: The proportion of rows to slice.
n: An optional integer argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
missing_rm: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.

Examples

julia> df = DataFrame(
           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
           b = [0.3, 2, missing, 3, 6, 5, 7, 7],
           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);

julia> @chain df begin
         @slice_max(b)
       end 
2×3 DataFrame
 Row │ a         b         c        
     │ Float64?  Float64?  Float64? 
─────┼──────────────────────────────
   1 │      5.0       7.0       5.0
   2 │      6.0       7.0       6.0

julia> @chain df begin
         @slice_max(b, with_ties = false)
       end 
1×3 DataFrame
 Row │ a         b         c        
     │ Float64?  Float64?  Float64? 
─────┼──────────────────────────────
   1 │      5.0       7.0       5.0

julia> @chain df begin
         @slice_max(b, n = 3)
       end 
3×3 DataFrame
 Row │ a         b         c        
     │ Float64?  Float64?  Float64? 
─────┼──────────────────────────────
   1 │      5.0       7.0       5.0
   2 │      6.0       7.0       6.0
   3 │      1.0       6.0       1.0

julia> @chain df begin
         @slice_max(b, prop = 0.5, missing_rm = true)
       end
3×3 DataFrame
 Row │ a         b         c        
     │ Float64?  Float64?  Float64? 
─────┼──────────────────────────────
   1 │      5.0       7.0       5.0
   2 │      6.0       7.0       6.0
   3 │      1.0       6.0       1.0

source

# TidierData.@slice_min — Macro.

@slice_min(df, column; with_ties = true, n, prop, missing_rm = true)

Retrieve rows with the minimum value(s) from the specified column of a DataFrame or GroupedDataFrame.

Arguments

df: The source data frame or grouped data frame from which to slice rows.
column: The column for which to slice the minimum values.
with_ties: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row.
prop: The proportion of rows to slice.
n: An optional integer argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
missing_rm: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.

Examples

julia> df = DataFrame(
           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);

julia> @chain df begin
         @slice_min(b)
       end 
2×3 DataFrame
 Row │ a         b         c         
     │ Float64?  Float64?  Float64?  
─────┼───────────────────────────────
   1 │  missing       0.3        0.2
   2 │  missing       0.3  missing

julia> @chain df begin
         @slice_min(b, with_ties = false)
       end 
1×3 DataFrame
 Row │ a         b         c        
     │ Float64?  Float64?  Float64? 
─────┼──────────────────────────────
   1 │  missing       0.3       0.2

julia> @chain df begin
         @slice_min(b, n = 3)
       end
3×3 DataFrame
 Row │ a          b         c         
     │ Float64?   Float64?  Float64?  
─────┼────────────────────────────────
   1 │ missing         0.3        0.2
   2 │ missing         0.3  missing   
   3 │       0.2       2.0        0.2  

julia> @chain df begin
         @slice_min(b, prop = 0.5, missing_rm = true)
       end
3×3 DataFrame
 Row │ a          b         c         
     │ Float64?   Float64?  Float64?  
─────┼────────────────────────────────
   1 │ missing         0.3        0.2
   2 │ missing         0.3  missing   
   3 │       0.2       2.0        0.2

source

# TidierData.@slice_sample — Macro.

@slice_sample(df, [n = 1, prop, replace = false])

Randomly sample rows from a DataFrame df or from each group in a GroupedDataFrame. The default is to return 1 row. Either the number of rows (n) or the proportion of rows (prop) should be provided as a keyword argument.

Arguments

df: The source data frame or grouped data frame from which to sample rows.
n: The number of rows to sample. Defaults to 1.
prop: The proportion of rows to sample.
replace: Whether to sample with replacement. Defaults to false.

Examples

julia> df = DataFrame(a = 1:10, b = 11:20);

julia> using StableRNGs, Random

julia> rng = StableRNG(1);

julia> Random.seed!(rng, 1);

julia> @chain df begin 
         @slice_sample(n = 5)
       end
5×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     6     16
   2 │     1     11
   3 │     5     15
   4 │     4     14
   5 │     8     18

julia> @chain df begin 
         @slice_sample(n = 5, replace = true)
       end
5×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     7     17
   2 │     2     12
   3 │     1     11
   4 │     4     14
   5 │     2     12

julia> @chain df begin 
         @slice_sample(prop = 0.5)
       end
5×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │     6     16
   2 │     7     17
   3 │     5     15
   4 │     9     19
   5 │     2     12

julia> @chain df begin 
         @slice_sample(prop = 0.5, replace = true)
       end
5×2 DataFrame
 Row │ a      b     
     │ Int64  Int64 
─────┼──────────────
   1 │    10     20
   2 │     4     14
   3 │     9     19
   4 │     9     19
   5 │     8     18

source

# TidierData.@slice_tail — Macro.

@slice_tail(df; n, prop)

Retrieve rows from the end of a DataFrame or GroupedDataFrame.

Arguments

df: The source data frame or grouped data frame from which to slice rows.
prop: The proportion of rows to slice.
n: An optional integer argument to specify the number of rows at the end of the dataframe to retrieve. Defaults to 1.

Examples

julia> df = DataFrame(
           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);

julia> @chain df begin
         @slice_tail(n = 3)
       end 
3×3 DataFrame
 Row │ a          b         c         
     │ Float64?   Float64?  Float64?  
─────┼────────────────────────────────
   1 │ missing         5.0  missing   
   2 │       5.0       7.0        5.0
   3 │       6.0       7.0        6.0

julia> @chain df begin
         @slice_tail(prop = 0.25)
       end 
2×3 DataFrame
 Row │ a         b         c        
     │ Float64?  Float64?  Float64? 
─────┼──────────────────────────────
   1 │      5.0       7.0       5.0
   2 │      6.0       7.0       6.0

source

# TidierData.@summarise — Macro.

@summarize(df, exprs...)
@summarise(df, exprs...)

Create a new DataFrame with one row that aggregates all observations from the input DataFrame or GroupedDataFrame.

Arguments

df: A DataFrame.
exprs...: a new_variable = function(old_variable) pair. function() should be an aggregate function that returns a single value.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @summarize(mean_b = mean(b),
                    median_b = median(b))
       end
1×2 DataFrame
 Row │ mean_b   median_b 
     │ Float64  Float64  
─────┼───────────────────
   1 │     3.0       3.0

julia> @chain df begin
         @summarize begin
           mean_b = mean(b)
           median_b = median(b)
         end
       end
1×2 DataFrame
 Row │ mean_b   median_b 
     │ Float64  Float64  
─────┼───────────────────
   1 │     3.0       3.0 

julia> @chain df begin
         @summarise(mean_b = mean(b), median_b = median(b))
       end
1×2 DataFrame
 Row │ mean_b   median_b 
     │ Float64  Float64  
─────┼───────────────────
   1 │     3.0       3.0

julia> @chain df begin
         @summarize(across((b,c), (minimum, maximum)))
       end
1×4 DataFrame
 Row │ b_minimum  c_minimum  b_maximum  c_maximum 
     │ Int64      Int64      Int64      Int64     
─────┼────────────────────────────────────────────
   1 │         1         11          5         15

julia> @chain df begin
         @summarize(across(where(is_number), minimum))
       end
1×2 DataFrame
 Row │ b_minimum  c_minimum 
     │ Int64      Int64     
─────┼──────────────────────
   1 │         1         11

source

# TidierData.@summarize — Macro.

@summarize(df, exprs...)
@summarise(df, exprs...)

Create a new DataFrame with one row that aggregates all observations from the input DataFrame or GroupedDataFrame.

Arguments

df: A DataFrame.
exprs...: a new_variable = function(old_variable) pair. function() should be an aggregate function that returns a single value.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @summarize(mean_b = mean(b),
                    median_b = median(b))
       end
1×2 DataFrame
 Row │ mean_b   median_b 
     │ Float64  Float64  
─────┼───────────────────
   1 │     3.0       3.0

julia> @chain df begin
         @summarize begin
           mean_b = mean(b)
           median_b = median(b)
         end
       end
1×2 DataFrame
 Row │ mean_b   median_b 
     │ Float64  Float64  
─────┼───────────────────
   1 │     3.0       3.0 

julia> @chain df begin
         @summarise(mean_b = mean(b), median_b = median(b))
       end
1×2 DataFrame
 Row │ mean_b   median_b 
     │ Float64  Float64  
─────┼───────────────────
   1 │     3.0       3.0

julia> @chain df begin
         @summarize(across((b,c), (minimum, maximum)))
       end
1×4 DataFrame
 Row │ b_minimum  c_minimum  b_maximum  c_maximum 
     │ Int64      Int64      Int64      Int64     
─────┼────────────────────────────────────────────
   1 │         1         11          5         15

julia> @chain df begin
         @summarize(across(where(is_number), minimum))
       end
1×2 DataFrame
 Row │ b_minimum  c_minimum 
     │ Int64      Int64     
─────┼──────────────────────
   1 │         1         11

source

# TidierData.@summary — Macro.

   @summary(df, cols...)

For numerical columns, returns a DataFrame with the Q1, Q3, min, max, mean, median, and number of missing values.

Arguments

df: A DataFrame
cols: Columns on which the summary will be performed. This is an optional argument; without it, the summary will be performed on all numerical columns.

Examples

julia> df = DataFrame(a = [1, 2, 3, 4, 5],
                      b = [missing, 7, 8, 9, 10],
                      c = [11, missing, 13, 14, missing],
                      d = [16, 17, 18, 19, 20]);

julia> @summary(df);

julia> @summary(df, (b:d));

julia> @chain df begin
         @summary(b:d)
       end;

source

# TidierData.@tally — Macro.

@tally(df, [wt], [sort])

Tally the unique values of one or more variables, with an optional weighting.

@tally() is a low-level helper macro for @count() that assumes that any grouping has already been performed. @chain df @tally() is roughly equivalent to @chain df @summarize(n = n()). Supply wt to perform weighted counts, switching the summary from n = n() to n = sum(wt).

Arguments

df: A DataFrame or GroupedDataFrame.
wt: Optional parameter. Used to calculate a sum over the provided wt variable instead of counting the rows.
sort: Defaults to false. Whether the result should be sorted from highest to lowest n.

Examples

julia> df = DataFrame(a = vcat(repeat(["a"], inner = 3),
                           repeat(["b"], inner = 3),
                           repeat(["c"], inner = 1),
                           missing),
                      b = 1:8)
8×2 DataFrame
 Row │ a        b     
     │ String?  Int64 
─────┼────────────────
   1 │ a            1
   2 │ a            2
   3 │ a            3
   4 │ b            4
   5 │ b            5
   6 │ b            6
   7 │ c            7
   8 │ missing      8

julia> @chain df @tally()
1×1 DataFrame
 Row │ n     
     │ Int64 
─────┼───────
   1 │     8

julia> @chain df begin
         @group_by(a)
         @tally()
       end
4×2 DataFrame
 Row │ a        n     
     │ String?  Int64 
─────┼────────────────
   1 │ a            3
   2 │ b            3
   3 │ c            1
   4 │ missing      1

julia> @chain df begin
         @group_by(a)
         @tally(wt = b)
       end
4×2 DataFrame
 Row │ a        n     
     │ String?  Int64 
─────┼────────────────
   1 │ a            6
   2 │ b           15
   3 │ c            7
   4 │ missing      8

julia> @chain df begin
         @group_by(a)
         @tally(wt = b, sort = true)
       end
4×2 DataFrame
 Row │ a        n     
     │ String?  Int64 
─────┼────────────────
   1 │ b           15
   2 │ missing      8
   3 │ c            7
   4 │ a            6

source

# TidierData.@transmute — Macro.

@transmute(df, exprs...)

Create a new DataFrame with only computed columns.

Arguments

df: A DataFrame.
exprs...: add new columns or replace values of existing columns using new_variable = values syntax.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @transmute(d = b + c)
       end
5×1 DataFrame
 Row │ d     
     │ Int64 
─────┼───────
   1 │    12
   2 │    14
   3 │    16
   4 │    18
   5 │    20

source

# TidierData.@ungroup — Macro.

@ungroup(df)

Return a DataFrame with all groups removed.

If this is applied to a GroupedDataFrame, then it removes the grouping. If this is applied to a DataFrame (without any groups), then it returns the DataFrame unchanged.

Arguments

df: A GroupedDataFrame or DataFrame.

Examples

julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15);

julia> @chain df begin
         @group_by(a)
       end
GroupedDataFrame with 5 groups based on key: a
First Group (1 row): a = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase)
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
⋮
Last Group (1 row): a = 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ e         5     15

julia> @chain df begin
         @group_by(a)
         @ungroup
       end
5×3 DataFrame
 Row │ a     b      c     
     │ Char  Int64  Int64 
─────┼────────────────────
   1 │ a         1     11
   2 │ b         2     12
   3 │ c         3     13
   4 │ d         4     14
   5 │ e         5     15

source

# TidierData.@unite — Macro.

  @unite(df, new_col, from_cols, sep, remove = true)

Unite multiple columns into one new column using a specific delimiter.

Arguments

df: A DataFrame
new_col: New column that will receive the combination
from_cols: Column names that it will combine, supports [] or ()
sep: the string or character that will separate the values in the new column
remove: defaults to true, removes input columns from data frame

Examples

julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);

julia> @unite(df, new_col, (b, c, d), "-")
3×1 DataFrame
 Row │ new_col 
     │ String  
─────┼─────────
   1 │ 1-1
   2 │ 2-2
   3 │ 3-3-3

julia> @unite(df, new_col, (b, c, d), "-", remove = false)
3×4 DataFrame
 Row │ b       c       d        new_col 
     │ String  String  String?  String  
─────┼──────────────────────────────────
   1 │ 1       1       missing  1-1
   2 │ 2       2       missing  2-2
   3 │ 3       3       3        3-3-3

source

# TidierData.@unnest_longer — Macro.

@unnest_longer(df, columns, indices_include=false)

Unnest arrays in columns from a DataFrame to create a longer DataFrame with one row for each entry of the array.

Arguments

df: A DataFrame.
columns: Columns to unnest. Can be a column symbol or a range of columns, provided they align in their number of values.
indices_include: Optional. When set to true, adds an index column for each unnested column, which logs the position of each array entry.
keep_empty: Optional. When set to true, rows with empty arrays are kept, not skipped, and unnested as missing.

Examples

julia> df = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
2×3 DataFrame
 Row │ a      b       c      
     │ Int64  Array…  Array… 
─────┼───────────────────────
   1 │     1  [1, 2]  [5, 6]
   2 │     2  [3, 4]  [7, 8]

julia> @unnest_longer(df, 2)
4×3 DataFrame
 Row │ a      b      c      
     │ Int64  Int64  Array… 
─────┼──────────────────────
   1 │     1      1  [5, 6]
   2 │     1      2  [5, 6]
   3 │     2      3  [7, 8]
   4 │     2      4  [7, 8]

julia> @unnest_longer(df, b:c, indices_include = true)
4×5 DataFrame
 Row │ a      b      c      b_id   c_id  
     │ Int64  Int64  Int64  Int64  Int64 
─────┼───────────────────────────────────
   1 │     1      1      5      1      1
   2 │     1      2      6      2      2
   3 │     2      3      7      1      1
   4 │     2      4      8      2      2

julia> df2 = DataFrame(x = 1:4, y = [[], [1, 2, 3], [4, 5], Int[]])
4×2 DataFrame
 Row │ x      y            
     │ Int64  Array…       
─────┼─────────────────────
   1 │     1  Any[]
   2 │     2  Any[1, 2, 3]
   3 │     3  Any[4, 5]
   4 │     4  Any[]

julia> @unnest_longer(df2, y, keep_empty = true)
7×2 DataFrame
 Row │ x      y       
     │ Int64  Any     
─────┼────────────────
   1 │     1  missing 
   2 │     2  1
   3 │     2  2
   4 │     2  3
   5 │     3  4
   6 │     3  5
   7 │     4  missing

source

# TidierData.@unnest_wider — Macro.

@unnest_wider(df, columns, names_sep)

Unnest specified columns of arrays or dictionaries into wider format dataframe with individual columns.

Arguments

df: A DataFrame.
columns: Columns to be unnested. These columns should contain arrays, dictionaries, dataframes, or tuples. Dictionary keys will be converted to column names.
names_sep: An optional string to specify the separator for creating new column names. If not provided, defaults to no separator.

Examples

julia> df = DataFrame(name = ["Zaki", "Farida"], attributes = [
               Dict("age" => 25, "city" => "New York"),
               Dict("age" => 30, "city" => "Los Angeles")]);

julia> @unnest_wider(df, attributes)
2×3 DataFrame
 Row │ name    city         age   
     │ String  String       Int64 
─────┼────────────────────────────
   1 │ Zaki    New York        25
   2 │ Farida  Los Angeles     30

julia> df2 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
2×3 DataFrame
 Row │ a      b       c      
     │ Int64  Array…  Array… 
─────┼───────────────────────
   1 │     1  [1, 2]  [5, 6]
   2 │     2  [3, 4]  [7, 8]

julia> @unnest_wider(df2, b:c, names_sep = "_")
2×5 DataFrame
 Row │ a      b_1    b_2    c_1    c_2   
     │ Int64  Int64  Int64  Int64  Int64 
─────┼───────────────────────────────────
   1 │     1      1      2      5      6
   2 │     2      3      4      7      8

source

Reference

Index¤

Reference - Exported functions¤

Reference - Internal functions¤