class: left, top, title-slide .title[ # Textual Data
Example: Jane Austen ] .author[ ### Keith VanderLinden
Calvin University
]

---

# Example: Austen Books

.pull-left[

```r
library(tidyverse)
library(tidytext)
library(janeaustenr)
```

```r
# Per book: number each line and keep a running count of
# chapter headings (lines before the first heading get 0),
# then split the text into one lower-cased word per row.
tidy_austen_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        text,
        regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_austen_books
```

]

.pull-right[

```
## # A tibble: 725,055 × 4
##    book                linenumber chapter word
##    <fct>                    <int>   <int> <chr>
##  1 Sense & Sensibility          1       0 sense
##  2 Sense & Sensibility          1       0 and
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by
##  5 Sense & Sensibility          3       0 jane
##  6 Sense & Sensibility          3       0 austen
##  7 Sense & Sensibility          5       0 1811
##  8 Sense & Sensibility         10       1 chapter
##  9 Sense & Sensibility         10       1 1
## 10 Sense & Sensibility         13       1 the
## # … with 725,045 more rows
```

]

???

---

# Example: Austen Sentiment

.pull-left[

```r
# Keep only words found in the sentiment lexicon. The lexicon
# ("bing", the get_sentiments() default) and the join key are
# named explicitly instead of being left to defaults.
# The result stays grouped by book.
austen_sentiment <- tidy_austen_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(book) %>%
  count(sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  # Net score: (positive - negative) over all matched words.
  mutate(
    sentiment = (positive - negative) / (positive + negative)
  )

austen_sentiment
```

]

.pull-right[

```
## # A tibble: 6 × 4
## # Groups:   book [6]
##   book                negative positive sentiment
##   <fct>                  <int>    <int>     <dbl>
## 1 Sense & Sensibility     3671     4933     0.147
## 2 Pride & Prejudice       3652     5052     0.161
## 3 Mansfield Park          4828     6749     0.166
## 4 Emma                    4809     7157     0.196
## 5 Northanger Abbey        2518     3244     0.126
## 6 Persuasion              2201     3473     0.224
```

]

???