austen_sentiment.knit

Examples from https://www.tidytextmining.com/sentiment.html

library(tidyverse)
library(tidytext)
library(janeaustenr)

# Build a one-word-per-row table of the novels. Each word keeps the line
# number (counted within its book) and a running chapter index; lines that
# come before the first chapter heading get chapter 0.
tidy_austen_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    # Every line matching a chapter heading bumps the cumulative count by one.
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
tidy_austen_books

## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # … with 725,045 more rows

# Show only the tokens that belong to "Pride & Prejudice".
filter(tidy_austen_books, book == "Pride & Prejudice")

## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows

# Keep only the words labelled "positive" in the sentiment lexicon.
# "bing" is the lexicon get_sentiments() returns when called with no argument.
positive_words <- filter(get_sentiments("bing"), sentiment == "positive")
positive_words

## # A tibble: 2,005 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abound      positive 
##  2 abounds     positive 
##  3 abundance   positive 
##  4 abundant    positive 
##  5 accessable  positive 
##  6 accessible  positive 
##  7 acclaim     positive 
##  8 acclaimed   positive 
##  9 acclamation positive 
## 10 accolade    positive 
## # … with 1,995 more rows

# Tally how often each positive-lexicon word occurs across all the books,
# most frequent first.
inner_join(tidy_austen_books, positive_words, by = "word") %>%
  count(word, sort = TRUE)

## # A tibble: 977 × 2
##    word          n
##    <chr>     <int>
##  1 well       1523
##  2 good       1380
##  3 great       981
##  4 like        725
##  5 better      639
##  6 enough      613
##  7 happy       534
##  8 love        495
##  9 pleasure    462
## 10 happiness   369
## # … with 967 more rows

This may explain why I like Jane Austen: as the table below shows, every book is net positive in sentiment (it contains more positive than negative words).

# Net sentiment score per book:
#   (positive - negative) / (positive + negative)
# computed from the number of word occurrences carrying each label.
# The result is still grouped by `book`.
austen_sentiment <- tidy_austen_books %>%
  # Spell out the lexicon and the join key so the join does not rely on
  # defaults or emit a "Joining, by" message.
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(book) %>%
  count(sentiment) %>%
  # One row per book with `negative` and `positive` count columns; a label
  # that never occurs in a book is filled with 0 rather than NA.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = (positive - negative) / (positive + negative))

austen_sentiment

## # A tibble: 6 × 4
## # Groups:   book [6]
##   book                negative positive sentiment
##   <fct>                  <int>    <int>     <dbl>
## 1 Sense & Sensibility     3671     4933     0.147
## 2 Pride & Prejudice       3652     5052     0.161
## 3 Mansfield Park          4828     6749     0.166
## 4 Emma                    4809     7157     0.196
## 5 Northanger Abbey        2518     3244     0.126
## 6 Persuasion              2201     3473     0.224