Notes: - Cover all the key functions required for the lab but, perhaps, simplify and/or remove some of the interesting exercises. - DSBox uses the “SelectorGadget”, which I’d rather not use; use the browser’s “Inspect Element” tool instead to find the appropriate HTML structural element to pull. - Cf. https://rstudio-education.github.io/datascience-box/course-materials/slides/u2-d19-top-250-imdb/u2-d19-top-250-imdb.html

library(robotstxt)
# Check robots.txt for the exact page scraped further down: permissions are
# defined per path, so a check on the site root does not cover /chart/top/.
paths_allowed("https://www.imdb.com/chart/top/")
## 
 www.imdb.com
## [1] TRUE
# Same robots.txt check for a second site, for comparison.
paths_allowed(paths = "http://www.facebook.com")
## 
 www.facebook.com
## [1] FALSE
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# Download and parse the Top 250 chart; the parsed document is stored in
# `page` and then printed.
page <- "https://www.imdb.com/chart/top/" %>%
  read_html()
page
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n            <img height="1" widt ...
# Movie titles: text of the <a> elements nested under class "titleColumn".
# html_elements() is the current rvest name for the superseded html_nodes().
titles <- page %>%
  html_elements(".titleColumn a") %>%
  html_text()

# Preview the first ten titles; head() does not pad with NA on short input.
head(titles, 10)
##  [1] "The Shawshank Redemption"                         
##  [2] "The Godfather"                                    
##  [3] "The Dark Knight"                                  
##  [4] "The Godfather: Part II"                           
##  [5] "12 Angry Men"                                     
##  [6] "Schindler's List"                                 
##  [7] "The Lord of the Rings: The Return of the King"    
##  [8] "Pulp Fiction"                                     
##  [9] "The Lord of the Rings: The Fellowship of the Ring"
## [10] "The Good, the Bad and the Ugly"
# Number of titles scraped.
length(titles)
## [1] 250
# Release years: take the text of the ".secondaryInfo" elements, strip the
# parentheses in a single pass, and convert to numeric.
# html_elements() is the current rvest name for the superseded html_nodes().
years <- page %>%
  html_elements(".secondaryInfo") %>%
  html_text() %>%
  str_remove_all("[\\(\\)]") %>%
  as.numeric()
# Preview the first ten years; head() does not pad with NA on short input.
head(years, 10)
##  [1] 1994 1972 2008 1974 1957 1993 2003 1994 2001 1966
# Number of years scraped.
length(years)
## [1] 250
# Ratings: text of every <strong> element on the page, converted to numeric.
# html_elements() is the current rvest name for the superseded html_nodes().
# NOTE(review): "strong" is a very broad selector; this assumes the page uses
# <strong> only for ratings -- confirm, or scope it to the rating column.
ratings <- page %>%
  html_elements("strong") %>%
  html_text() %>%
  as.numeric()
# Preview the first ten ratings; head() does not pad with NA on short input.
head(ratings, 10)
##  [1] 9.2 9.2 9.0 9.0 9.0 8.9 8.9 8.8 8.8 8.8
# Number of ratings scraped.
length(ratings)
## [1] 250
# Assemble the scraped vectors into one tibble, one row per movie.
# `rank` is the position in the scraped vectors. seq_along() yields an integer
# sequence and, unlike 1:nrow(), an empty one when nothing was scraped.
imdb_top_250 <- tibble(
  rank = seq_along(titles),
  title = titles,
  year = years,
  rating = ratings
)
imdb_top_250
## # A tibble: 250 × 4
##     rank title                                              year rating
##    <int> <chr>                                             <dbl>  <dbl>
##  1     1 The Shawshank Redemption                           1994    9.2
##  2     2 The Godfather                                      1972    9.2
##  3     3 The Dark Knight                                    2008    9  
##  4     4 The Godfather: Part II                             1974    9  
##  5     5 12 Angry Men                                       1957    9  
##  6     6 Schindler's List                                   1993    8.9
##  7     7 The Lord of the Rings: The Return of the King      2003    8.9
##  8     8 Pulp Fiction                                       1994    8.8
##  9     9 The Lord of the Rings: The Fellowship of the Ring  2001    8.8
## 10    10 The Good, the Bad and the Ugly                     1966    8.8
## # … with 240 more rows

The number of movies per year.

# Tally movies by release year, largest counts first.
count(imdb_top_250, year, sort = TRUE)
## # A tibble: 86 × 2
##     year     n
##    <dbl> <int>
##  1  1995     8
##  2  2004     7
##  3  1957     6
##  4  1999     6
##  5  2003     6
##  6  2009     6
##  7  2019     6
##  8  1975     5
##  9  1994     5
## 10  1997     5
## # … with 76 more rows
# Show the entries released in 2021, printing at most 8 rows.
print(filter(imdb_top_250, year == 2021), n = 8)
## # A tibble: 2 × 4
##    rank title                    year rating
##   <int> <chr>                   <dbl>  <dbl>
## 1   148 Spider-Man: No Way Home  2021    8.1
## 2   235 Jai Bhim                 2021    8

The average rating of movies per year.

# Scatter plot of the mean rating per release year with a linear trend line
# (no confidence band).
ggplot(
  data = summarise(group_by(imdb_top_250, year), avg_score = mean(rating)),
  mapping = aes(x = year, y = avg_score)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Year", y = "Average score")
## `geom_smooth()` using formula = 'y ~ x'