Harbest.jl
Web scraping is a technique for extracting data from web pages. This package lets you scrape data from static HTML pages.
This is a port of tidyverse/rvest.
Usage
using Harbest
# Read the page at the given URL into a document object.
starwars = read_html("https://rvest.tidyverse.org/articles/starwars.html")
# Select elements with the selector list and extract their text.
# NOTE(review): presumably the selectors are applied in sequence (`h2` inside `section`) — confirm.
titles = html_elements(starwars, ["section", "h2"]) |> html_text3
# Evaluating the bare name displays the resulting vector at the REPL.
titles
# 7-element Vector{String}:
# "The Phantom Menace"
# "Attack of the Clones"
# "Revenge of the Sith"
# ⋮
# "Return of the Jedi"
# "The Force Awakens"
# Read a fixed revision (`oldid=...`) of the page so the example output stays reproducible.
html = read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565")
# Select the elements matching `.tracklist` and convert them with `html_table`;
# the output below shows the result displayed as a DataFrame.
table = html_elements(html, ".tracklist") |> html_table
# Evaluating the bare name displays the table at the REPL.
table
# 28×4 DataFrame
# Row │ No. Title Performer(s) Length
# │ String String String String
# ─────┼──────────────────────────────────────────────────────────────────────────────────────
# 1 │ 1. "Everything Is Awesome" Tegan and Sara featuring The Lon… 2:43
# 2 │ 2. "Prologue" 2:28
# 3 │ 3. "Emmett's Morning" 2:00
# 4 │ 4. "Emmett Falls in Love" 1:11
# 5 │ 5. "Escape" 3:26
# ⋮ │ ⋮ ⋮ ⋮ ⋮
# 25 │ 25. "Everything Is Awesome" Jo Li (Joshua Bartholomew and Li… 1:26
# 26 │ 26. "Everything Is Awesome (unplugge… Shawn Patterson and Sammy Allen 1:24
# 27 │ 27. "Untitled Self Portrait" Will Arnett 1:08
# 28 │ 28. "Everything Is Awesome (instrume… 2:41
# 19 rows omitted
Functions
- `read_html(url)`
- `html_elements(html, string)` or `html_elements(html, strings)`
- `html_attrs(html, string)` or `html_attrs(html)`
- `html_text(html)` or `html_text2(html)` or `html_text3(html)`
- `html_table(html)`
Tutorial
First, we import the packages we need:
using Harbest, DataFrames, PlotlyJS
Then, scrape the data with html_elements, html_attrs and html_text3:
"""
    get_scores(html)

Return the episode scores found in `html` as a `Vector{Float64}`.

The text of every element matching `.ipl-rating-star__rating` is collected; only
entries that differ from `"Rate"` and contain a `"."` are kept and parsed as
`Float64`. `parse` throws if a kept entry is not a valid number.
"""
function get_scores(html)
    rating_nodes = html_elements(html, ".ipl-rating-star__rating")
    rating_texts = html_text3(rating_nodes)
    # Drop the "Rate" placeholders and anything that has no decimal point.
    numeric_texts = filter(rating_texts) do entry
        entry != "Rate" && occursin(".", entry)
    end
    return convert(Vector{Float64}, parse.(Float64, numeric_texts))
end
"""
    get_names(html)

Return, as a `Vector{String}`, the text of the elements selected from `html`
with the selector list `[".info", "strong"]`.
"""
function get_names(html)
    name_nodes = html_elements(html, [".info", "strong"])
    return convert(Vector{String}, html_text3(name_nodes))
end
"""
    get_imgs(html)

Return, as a `Vector{String}`, the `src` attribute of the elements selected from
`html` with the selector list `["img", ".zero-z-index"]`.
"""
function get_imgs(html)
    image_nodes = html_elements(html, ["img", ".zero-z-index"])
    sources = html_attrs(image_nodes, "src")
    return convert(Vector{String}, sources)
end
"""
    get_n_season(html)

Return the number of seasons as an `Int`.

Despite its name, `html` is handed straight to `read_html`, so it is the
location of the page rather than an already-parsed document. The count is the
text of the second element selected with `["select", "option"]`, parsed as an
integer.
"""
function get_n_season(html)
    page = read_html(html)
    options = html_elements(page, ["select", "option"])
    # The second selected option carries the season count.
    season_label = html_text3(options[2])
    return parse(Int, season_label)
end
"""
    get_df(url)

Build a `DataFrame` with one row per episode for the show whose page is at `url`.

`url` may be given with or without a trailing `/`. For every season `k` from 1
to `get_n_season(url)`, the page `<url>/episodes?season=k` is read and
contributes the columns `scores`, `names`, `season` (the season number) and
`images`. A final column `N` numbers the rows of the combined table from 1.

The `DataFrame` constructor throws if the scores, names and images scraped from
one season page do not have the same length.
"""
function get_df(url)
    n_seasons = get_n_season(url)
    # Without this, a URL lacking the trailing slash would produce
    # ".../tt1439629episodes?season=1".
    base = endswith(url, "/") ? url : url * "/"
    # Typed comprehension so that an empty season range still yields a
    # Vector{DataFrame}, which `reduce(vcat, ...)` turns into an empty table.
    season_frames = DataFrame[
        begin
            page = read_html(string(base, "episodes?season=", season))
            DataFrame(;
                scores=get_scores(page),
                names=get_names(page),
                season=season,
                images=get_imgs(page),
            )
        end for season in 1:n_seasons
    ]
    # Concatenate once instead of re-copying the growing table on each iteration.
    df::DataFrame = reduce(vcat, season_frames)
    df[!, "N"] = collect(1:nrow(df))
    return df
end
"""
    plot_df(df, title)

Plot the `scores` column of `df` against its `N` column as lines, colored by
`season`, with the `names` column attached as text.

The plot title is `title` followed by `" score on IMDb"`.
"""
function plot_df(df, title)
    layout = Layout(; title=title * " score on IMDb")
    axis_labels = Dict(
        :N => "Episode number",
        :scores => "Score",
        :season => "Season",
    )
    return plot(
        df,
        layout;
        x=:N,
        y=:scores,
        text=:names,
        color=:season,
        mode="lines",
        labels=axis_labels,
    )
end
# Build the per-episode table for each show and plot its scores.
# NOTE(review): each `get_df` call reads one page for the season count plus one
# page per season — presumably over the network; expect it to be slow.
community_df = get_df("https://www.imdb.com/title/tt1439629/")
plot_df(community_df,"Community")
bojack_df = get_df("https://www.imdb.com/title/tt3398228/")
plot_df(bojack_df,"Bojack Horseman")