Rev. | 362b496852abb158f51e9501936754431c20cdd7 |
---|---|
Size | 5,775 bytes |
Time | 2023-11-10 22:35:56 |
Author | Lorenzo Isella |
Log Message | I added a better plot of the LDA and I now first determine the optimal number of topics. |
rm(list=ls())
library(quanteda)
require(quanteda.textstats)
require(quanteda.textplots)
library(tidytext)
library(readtext)
library(tidyverse)
library(quanteda.sentiment)
library(seededlda)
library(stm)
library(ldatuning)
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
data(stop_words)
## df <- readtext("speech2023.txt")
df <- readtext("./*txt",
               docvarsfrom = "filenames" ## ,
               ## docvarnames = c("unit", "context", "year", "language", "party"),
               ## dvsep = "_",
               ## encoding = "ISO-8859-1"
               )
mycorpus <- corpus(df)
summary(mycorpus)
## a better way to generate tokens taken from
## https://tutorials.quanteda.io/multilingual/english-german/
toks <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_url = TRUE, remove_symbols = TRUE) |>
    tokens_remove(pattern = stopwords("en", source = "marimo")) |>
    tokens_keep(pattern = "^[a-zA-Z]+$", valuetype = "regex") ## |>
    ## tokens_tolower() ## no need to lowercase here, since dfm() does it
    ## and tokens_compound() is case insensitive by default
    ## |> ## it is an open question whether to stem or not
    ## tokens_wordstem()
dfm_mat <- dfm(toks) ### by default the dfm function moves everything to lowercase
bigrams <- tokens_ngrams(toks, 2) ## all bigrams (not used further below)
toks_eu_bigram <- tokens_compound(toks, pattern = phrase("Europe *")) ## by default this is case insensitive.
toks_eu_bigram_select <- tokens_select(toks_eu_bigram, pattern = phrase("Europe_*"))
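## Quick sanity check (my addition, not part of the original workflow): the
## frequencies of the compounded "Europe_*" expressions actually found in the
## speeches, via a dfm built from the selected tokens.
topfeatures(dfm(toks_eu_bigram_select), 10)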
tf <- topfeatures(dfm_mat, 10)
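## Optional sketch: eyeball the overall vocabulary with a word cloud before the
## keyness and topic-model steps (uses quanteda.textplots::textplot_wordcloud).
textplot_wordcloud(dfm_mat, max_words = 100)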
tstat_key <- textstat_keyness(dfm_mat,
                              target = "speech2023.txt")
df_keyness <- tstat_key |>
    as_tibble()
gpl <- textplot_keyness(tstat_key, n=10)
ggsave("vdl-keyness.pdf", gpl, width=8,height=8)
## sentiment analysis
## Various methodologies to do sentiment analysis
sent_vdl <- mycorpus |>
    textstat_polarity(data_dictionary_LSD2015)
sent_vdl
valence(data_dictionary_LSD2015) <- list(positive = 1, negative = -1,
                                         neg_negative = 1, neg_positive = -1)
sent_vdl2 <- textstat_valence(toks, data_dictionary_LSD2015)
sent_vdl2
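## A minimal sketch of a comparison plot (assumption: textstat_polarity()
## returns a data frame with doc_id and sentiment columns; the file name
## vdl-sentiment.pdf is my choice, not from the original).
gpl <- ggplot(sent_vdl, aes(x = doc_id, y = sentiment)) +
    geom_col() +
    xlab(NULL) +
    ylab("LSD2015 polarity")
ggsave("vdl-sentiment.pdf", gpl, width = 8, height = 8)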
## see https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
## determining the optimal number of topics
result <- FindTopicsNumber(
    dfm_mat,
    topics = seq(from = 2, to = 15, by = 1),
    metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
    method = "Gibbs",
    control = list(seed = 77),
    mc.cores = 2L,
    verbose = TRUE
)
FindTopicsNumber_plot(result)
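## Sketch: also save the ldatuning diagnostics next to the other figures.
## FindTopicsNumber_plot() draws on the active graphics device, so it is
## wrapped in pdf()/dev.off(); the file name lda-tuning.pdf is my choice.
pdf("lda-tuning.pdf", width = 8, height = 6)
FindTopicsNumber_plot(result)
dev.off()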
## 8-10 topics are a reasonable number
### simple LDA topic modeling
## See https://koheiw.github.io/seededlda/articles/pkgdown/basic.html
## for some reason, fitting the model on the trimmed dfm does not work
## dfmt <- dfm_mat |>
## dfm_remove("*@*") |>
## dfm_trim(max_docfreq = 0.1, docfreq_type = "prop")
lda <- textmodel_lda(dfm_mat, k = 9, verbose = TRUE)
knitr::kable(terms(lda))
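## Quick check (sketch): seededlda::topics() returns the most likely topic for
## each document; with only two speeches this simply shows which topic
## dominates each one.
topics(lda)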
## top 10 terms per topic
top10 <- terms(lda, n = 10) |>
    as_tibble() |>
    pivot_longer(cols = starts_with("t"),
                 names_to = "topic", values_to = "word")
phi <- lda$phi |>
    as_tibble(rownames = "topic") |>
    pivot_longer(cols = c(-topic))
top10phi <- top10 |>
    left_join(y = phi, by = c("topic", "word" = "name")) ## finally I have a tibble I can work with
top10phi
dd2 <- sort_facets(top10phi, topic, word, category2, value)
gpl <- ggplot(dd2, aes(y = category2, x = value)) +
    geom_bar(stat = "identity") +
    facet_wrap(. ~ topic, scales = "free_y", nrow = 3) +
    scale_y_discrete(labels = dd2$word, breaks = dd2$category2) +
    xlab("Probability") +
    ylab(NULL)
ggsave("lda-topic-keywords.pdf", gpl, width=8,height=8)
### now another approach based on stm
## see https://rstudio-pubs-static.s3.amazonaws.com/406792_9287b832dd9e413f97243628cb2f7ddb.html
## convert the dfm to a format suitable for stm
dfm2stm <- convert(dfm_mat, to = "stm")
model.stm <- stm(dfm2stm$documents, dfm2stm$vocab, K = 9, data = dfm2stm$meta,
                 init.type = "Spectral")
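## Sketch: stm's built-in topic summaries (highest-probability, FREX, lift and
## score words) as a quick complement to the tidy plot below.
labelTopics(model.stm, n = 10)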
## I make the model tidy.
## See https://juliasilge.com/blog/sherlock-holmes-stm/
stm_tidy <- tidy(model.stm)
gpl <- stm_tidy |>
    group_by(topic) |>
    top_n(10, beta) |>
    ungroup() |>
    mutate(topic = paste0("Topic ", topic),
           term = reorder_within(term, beta, topic)) |>
    ggplot(aes(term, beta, fill = as.factor(topic))) +
    geom_col(alpha = 0.8, show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free_y") +
    coord_flip() +
    scale_x_reordered() +
    labs(x = NULL, y = expression(beta),
         title = "Highest word probabilities for each topic",
         subtitle = "Different words are associated with different topics")
ggsave("stm-keywords.pdf", gpl, width=8,height=8)
######################################
## ## let us see the tidy approach
## dfm_tidy <- tidy(dfm_mat)
## corpus_tidy <- tidy(mycorpus)
## df_tidy <- list.files(pattern = "*.txt") %>%
## map_chr(~ read_file(.)) %>%
## tibble(text = .)
## df_clean1 <- clean_text("speech2023.txt") |>
## mutate(origin="2023 speech")
## df_clean2 <- clean_text("speech2022.txt") |>
## mutate(origin="2022 speech")
## df_clean <- rbind(df_clean1, df_clean2) |>
## mutate(text=tolower(text))
## df_uni <- df_clean |>
## clean_unigrams(stop_words)
## df_big <- df_clean |>
## clean_bigrams(stop_words) |>
## filter(word1=="europe")
## df_big_count <- df_clean |>
## group_by(origin) |>
## count_bigrams(stop_words) |>
## ungroup() |>
## arrange(origin, desc(n)) |>
## filter(word1=="europe")
print("So far so good")