Examples from https://www.tidytextmining.com/sentiment.html
library(tidyverse)
library(tidytext)
library(janeaustenr)
# Tokenise the Jane Austen novels into one word per row. Each line of text is
# tagged with its position in the book and a running chapter count *before*
# the text is split, so every word remembers where it came from.
tidy_austen_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    # Grouping by book makes the line index restart at 1 for every novel.
    linenumber = row_number(),
    # Every line that looks like a chapter heading bumps the counter, so lines
    # that precede the first heading end up in chapter 0.
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_austen_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # … with 725,045 more rows
# Look at the tokens of a single novel.
filter(tidy_austen_books, book == "Pride & Prejudice")
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
# Positive entries of the sentiment lexicon. "bing" is the lexicon
# get_sentiments() falls back to when none is given; naming it here makes the
# choice visible to the reader.
positive_words <- get_sentiments("bing") %>%
  filter(sentiment == "positive")

positive_words
## # A tibble: 2,005 × 2
## word sentiment
## <chr> <chr>
## 1 abound positive
## 2 abounds positive
## 3 abundance positive
## 4 abundant positive
## 5 accessable positive
## 6 accessible positive
## 7 acclaim positive
## 8 acclaimed positive
## 9 acclamation positive
## 10 accolade positive
## # … with 1,995 more rows
# Most frequent positive words across all the novels: keep only tokens that
# appear in the positive word list, then tally them in descending order.
tidy_austen_books %>%
  inner_join(positive_words, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 977 × 2
## word n
## <chr> <int>
## 1 well 1523
## 2 good 1380
## 3 great 981
## 4 like 725
## 5 better 639
## 6 enough 613
## 7 happy 534
## 8 love 495
## 9 pleasure 462
## 10 happiness 369
## # … with 967 more rows
This explains why I like Jane Austen: every book is net positive in sentiment, i.e. positive words outnumber negative words in each of the six novels.
# Net sentiment score per book.
#
# Every token that appears in the Bing lexicon is labelled positive or
# negative, the labels are counted per book, and the counts are combined into
# (positive - negative) / (positive + negative): a value in [-1, 1] where
# anything above 0 means positive words outnumber negative ones.
#
# The join key is spelled out (as in the positive-word count above) so the
# join does not have to guess it and no "Joining, by = ..." message is emitted.
# The result is intentionally left grouped by `book`, exactly as before.
austen_sentiment <- tidy_austen_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(book) %>%
  count(sentiment) %>%
  # One row per book with `negative` and `positive` count columns; a label
  # that never occurs in a book is filled with 0 rather than NA.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = (positive - negative) / (positive + negative))

austen_sentiment
## # A tibble: 6 × 4
## # Groups: book [6]
## book negative positive sentiment
## <fct> <int> <int> <dbl>
## 1 Sense & Sensibility 3671 4933 0.147
## 2 Pride & Prejudice 3652 5052 0.161
## 3 Mansfield Park 4828 6749 0.166
## 4 Emma 4809 7157 0.196
## 5 Northanger Abbey 2518 3244 0.126
## 6 Persuasion 2201 3473 0.224