Анализ частот встречаемости слов
Материал из Поле цифровой дидактики
Токенизация и частоты слов в литературе с tidytext. Основные понятия: токенизация (unnest_tokens), стоп-слова (stop_words, anti_join), загрузка корпусов (gutenberg_download), пропорции частот (proportion = n / sum(n)), переформатирование данных (pivot_wider / pivot_longer).
# Four-line sample text used to demonstrate tokenization; each element of the
# character vector holds one line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)
library(dplyr)
library(tidytext)
# Put the sample lines into a tibble with a line index. The index is derived
# from `text` itself (seq_along) rather than a hard-coded 1:4, so the table
# stays valid if lines are added to or removed from `text`.
text_df <- tibble(line = seq_along(text), text = text)

# Split the `text` column into one token per row, stored in a new `word`
# column; the result is printed, not assigned.
text_df %>%
  unnest_tokens(word, text)
# Load the `stop_words` data set into the workspace.
data(stop_words)
library(gutenbergr)

# Download four works by their Project Gutenberg IDs (needs network access).
hgwells <- gutenberg_download(c(35, 36, 5230, 159))

# One row per word, with stop words dropped. The join key is spelled out so
# the join does not depend on whichever column names the two tables happen
# to share, and no "Joining with ..." message is emitted.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Word counts, most frequent first (printed, not assigned).
tidy_hgwells %>%
  count(word, sort = TRUE)
# Download five works by their Project Gutenberg IDs (needs network access).
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))

# One row per word, with stop words dropped. The join key is spelled out so
# the join does not depend on whichever column names the two tables happen
# to share, and no "Joining with ..." message is emitted.
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Word counts, most frequent first (printed, not assigned).
tidy_bronte %>%
  count(word, sort = TRUE)
# Relative word frequencies per author.
#
# Steps:
#   1. Stack both tidy word tables, labelling each with its author.
#   2. Count words per author and convert counts to within-author proportions.
#   3. Widen to one column per author and lengthen again, so every word gets
#      a row for both authors (NA where an author never uses the word).
#
# pivot_wider() / pivot_longer() live in tidyr, which this script never
# attaches, so they are called with an explicit `tidyr::` prefix.
# ungroup() drops the `author` grouping once the proportions are computed so
# the reshaping steps operate on a plain, ungrouped table.
frequency <- bind_rows(
  mutate(tidy_bronte, author = "Brontë Sisters"),
  mutate(tidy_hgwells, author = "H.G. Wells")
) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  tidyr::pivot_wider(names_from = author, values_from = proportion) %>%
  tidyr::pivot_longer(
    cols = `Brontë Sisters`:`H.G. Wells`,
    names_to = "author",
    values_to = "proportion"
  )

# Print the resulting table.
frequency
