Notes:
- Cover all the key functions required for the lab but, where it helps, simplify or remove some of the interesting exercises.
- DSBox uses the “SelectorGadget” tool, which I’d rather not use. Use the browser’s “Inspect Element” feature instead to find the appropriate HTML structural element to pull.
- Cf. https://rstudio-education.github.io/datascience-box/course-materials/slides/u2-d19-top-250-imdb/u2-d19-top-250-imdb.html
library(robotstxt)
# Ask robotstxt whether the site's robots.txt permits access to this path
# before scraping anything from it.
paths_allowed(paths = "http://www.imdb.com")
##
## www.imdb.com
## [1] TRUE
# Same robots.txt check for a second site, shown for contrast with the
# result above.
paths_allowed(paths = "http://www.facebook.com")
##
## www.facebook.com
## [1] FALSE
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
# Download and parse the chart page; `page` is the parsed HTML document that
# the extraction steps below query with CSS selectors.
page <- "https://www.imdb.com/chart/top/" %>%
  read_html()
# Print the document summary to confirm the page was read.
print(page)
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
# Extract the text of every link nested inside a `.titleColumn` element.
titles <- html_text(html_nodes(page, css = ".titleColumn a"))
# Inspect the first ten extracted titles.
titles[seq_len(10)]
## [1] "The Shawshank Redemption"
## [2] "The Godfather"
## [3] "The Dark Knight"
## [4] "The Godfather: Part II"
## [5] "12 Angry Men"
## [6] "Schindler's List"
## [7] "The Lord of the Rings: The Return of the King"
## [8] "Pulp Fiction"
## [9] "The Lord of the Rings: The Fellowship of the Ring"
## [10] "The Good, the Bad and the Ugly"
# Sanity check: how many titles were matched.
length(titles)
## [1] 250
# Extract the text of each `.secondaryInfo` element, strip the first opening
# and the first closing parenthesis from it, and convert what is left to a
# number.
years <- page %>%
  html_nodes(css = ".secondaryInfo") %>%
  html_text() %>%
  str_replace(pattern = "\\(", replacement = "") %>%
  str_replace(pattern = "\\)", replacement = "") %>%
  as.numeric()
# Inspect the first ten parsed years.
years[seq_len(10)]
## [1] 1994 1972 2008 1974 1957 1993 2003 1994 2001 1966
# Sanity check: how many years were matched.
length(years)
## [1] 250
# Extract the text of every <strong> element on the page and convert it to a
# number.
ratings <- as.numeric(html_text(html_nodes(page, css = "strong")))
# Inspect the first ten parsed ratings.
ratings[seq_len(10)]
## [1] 9.2 9.2 9.0 9.0 9.0 8.9 8.9 8.8 8.8 8.8
# Sanity check: how many ratings were matched.
length(ratings)
## [1] 250
# Combine the three scraped vectors into one tibble, one row per element.
# tibble() only recycles length-1 vectors, so vectors of otherwise differing
# lengths raise an error here instead of being silently misaligned.
imdb_top_250 <- tibble(
  title = titles,
  year = years,
  rating = ratings
)
# The rank is the row position in the tibble. row_number() is used rather
# than 1:nrow(imdb_top_250) because the latter evaluates to c(1, 0) when the
# tibble has zero rows, which makes mutate() fail with a size mismatch.
imdb_top_250 <- imdb_top_250 %>%
  mutate(rank = row_number()) %>%
  relocate(rank)
imdb_top_250
## # A tibble: 250 × 4
## rank title year rating
## <int> <chr> <dbl> <dbl>
## 1 1 The Shawshank Redemption 1994 9.2
## 2 2 The Godfather 1972 9.2
## 3 3 The Dark Knight 2008 9
## 4 4 The Godfather: Part II 1974 9
## 5 5 12 Angry Men 1957 9
## 6 6 Schindler's List 1993 8.9
## 7 7 The Lord of the Rings: The Return of the King 2003 8.9
## 8 8 Pulp Fiction 1994 8.8
## 9 9 The Lord of the Rings: The Fellowship of the Ring 2001 8.8
## 10 10 The Good, the Bad and the Ugly 1966 8.8
## # … with 240 more rows
The number of movies per year.
# Tally rows per year, with the most frequent years listed first.
count(imdb_top_250, year, sort = TRUE)
## # A tibble: 86 × 2
## year n
## <dbl> <int>
## 1 1995 8
## 2 2004 7
## 3 1957 6
## 4 1999 6
## 5 2003 6
## 6 2009 6
## 7 2019 6
## 8 1975 5
## 9 1994 5
## 10 1997 5
## # … with 76 more rows
# Show the rows whose year is 2021, printing at most eight of them.
print(
  filter(imdb_top_250, year == 2021),
  n = 8
)
## # A tibble: 2 × 4
## rank title year rating
## <int> <chr> <dbl> <dbl>
## 1 148 Spider-Man: No Way Home 2021 8.1
## 2 235 Jai Bhim 2021 8
The average rating of movies per year.
# Compute the mean rating for each year, then plot it against the year as a
# scatter plot with a fitted straight line (no confidence band).
ggplot(
  data = summarise(group_by(imdb_top_250, year), avg_score = mean(rating)),
  mapping = aes(x = year, y = avg_score)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Year", y = "Average score")
## `geom_smooth()` using formula = 'y ~ x'