Reusing a Query and Views

While using TidierDB, you may need to generate part of a query and reuse it multiple times. There are two ways to do this

saving the query
@create_view(name)

Setup¤

using TidierDB
con = connect(duckdb());
mtcars_path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv";
mtcars = dt(con, mtcars_path);

Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using @show_query or @collect

query = @chain mtcars begin
    @group_by cyl
    @summarize begin
        across(mpg, (mean, minimum, maximum))
        num_cars = n()
        end
    @mutate begin
        efficiency = case_when(
            mpg_mean >= 25, "High",
            mpg_mean >= 15, "Moderate",
            "Low" )
    end
end;

@chain query begin
   @left_join(mtcars, cyl)
   @group_by(efficiency)
   @summarize(avg_hp = mean(hp))
   @collect
end

@chain query begin
    @summarize(avg_hp = mean(hp))
end

SQLQuery("SELECT AVG(hp) AS avg_hp", "cte_2", "", "", "", "", "", "", true, true, false, 18×4 DataFrame
 Row │ name         type     current_selxn  table_name
     │ String       String   Int64          SubString…
─────┼────────────────────────────────────────────────────────────────────────
   1 │ model        VARCHAR              0  mtcars
   2 │ mpg          DOUBLE               0  mtcars
   3 │ cyl          BIGINT               0  mtcars
   4 │ disp         DOUBLE               0  mtcars
   5 │ hp           BIGINT               0  mtcars
   6 │ drat         DOUBLE               0  mtcars
   7 │ wt           DOUBLE               0  mtcars
   8 │ qsec         DOUBLE               0  mtcars
  ⋮  │      ⋮          ⋮           ⋮                        ⋮
  12 │ carb         BIGINT               0  mtcars
  13 │ mpg_mean     UNKNOWN              0  'https://gist.githubusercontent.…
  14 │ mpg_minimum  UNKNOWN              0  'https://gist.githubusercontent.…
  15 │ mpg_maximum  UNKNOWN              0  'https://gist.githubusercontent.…
  16 │ num_cars     UNKNOWN              0  'https://gist.githubusercontent.…
  17 │ efficiency   UNKNOWN              0  cte_2
  18 │ avg_hp       UNKNOWN              1  cte_2
                                                                3 rows omitted, false, DuckDB.DB(":memory:"), TidierDB.CTE[TidierDB.CTE("cte_1", " SELECT cyl, AVG(mpg) AS mpg_mean, MIN(mpg) AS mpg_minimum, MAX(mpg) AS mpg_maximum, ***COUNT(*)*** AS num_cars FROM 'https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv' AS mtcars  GROUP BY cyl", "", "", "", ""), TidierDB.CTE("cte_2", " cyl, mpg_mean, mpg_minimum, mpg_maximum, num_cars, CASE WHEN mpg_mean >= 25 THEN 'High' WHEN mpg_mean >= 15 THEN 'Moderate' ELSE 'Low' END AS efficiency FROM cte_1 ", "", "", "", "")], 2, nothing, "", "", 0, false, true, false, false)

@create_view¤

Queries can also be reused as views. This can be especially helpful when joining complex queries together, however, it is not necessary.

query2 = @chain mtcars @filter(mpg>20) @mutate(mpg = mpg *4);
@chain mtcars begin
    @group_by cyl
    @summarize begin
        across(mpg, (mean, minimum, maximum))
        num_cars = n()
        end
    @mutate begin
        efficiency = case_when(
            mpg_mean >= 25, "High",
            mpg_mean >= 15, "Moderate",
            "Low" )
        end
    @create_view(viewer)
end;

@chain dt(con, "viewer") begin
    @left_join(query2, cyl == cyl)
    @summarize(avg_mean = mean(mpg), _by = efficiency)
    @mutate(mean = avg_mean / 4 )
    @collect
end

2×3 DataFrame

Row	efficiency	avg_mean	mean
	String	Float64	Float64
1	Moderate	84.5333	21.1333
2	High	106.655	26.6636

Preview or save an intermediate table¤

While querying a dataset, you may wish to see an intermediate table, or even save it. You can use @aside and _, illustrated below, to do just that. While we opted to print the results in this simple example below, we could have saved them by using name = @chain...

import ClickHouse;
conn = conn = connect(clickhouse(); host="localhost", port=19000, database="default", user="default", password="")
path = "https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"
@chain dt(conn, path) begin
   @count(artists)
   @aside println(@chain _ @head(5) @collect)
   @arrange(desc(count))
   @collect
end

5×2 DataFrame
 Row │ artists  count
     │ String?  UInt64
─────┼─────────────────
   1 │ missing       1
   2 │ Wizo          3
   3 │ MAGIC!        3
   4 │ Macaco        1
   5 │ SOYOU         1
31438×2 DataFrame
   Row │ artists          count
       │ String?          UInt64
───────┼─────────────────────────
     1 │ The Beatles         279
     2 │ George Jones        271
     3 │ Stevie Wonder       236
     4 │ Linkin Park         224
     5 │ Ella Fitzgerald     222
     6 │ Prateek Kuhad       217
     7 │ Feid                202
   ⋮   │        ⋮           ⋮
 31432 │ Leonard               1
 31433 │ marcos g              1
 31434 │ BLVKSHP               1
 31435 │ Memtrix               1
 31436 │ SOYOU                 1
 31437 │ Macaco                1
 31438 │ missing               1
               31424 rows omitted

This page was generated using Literate.jl.