Topic Modeling

Chris Bail
Duke University

What is Topic Modeling?

Latent Dirichlet Allocation

Example: LDA of Scientific Abstracts

Running Your First Topic Model

library(topicmodels)
data("AssociatedPress")

Running Your First Topic Model

AP_topic_model<-LDA(AssociatedPress, k=10, control = list(seed = 321))

Running Your First Topic Model

library(tidytext)
library(dplyr)
library(ggplot2)

AP_topics <- tidy(AP_topic_model, matrix = "beta")

ap_top_terms <- 
  AP_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

Plot

ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Plot

Reading Tea Leaves

Structural Topic Modeling

Political Blogs Data

google_doc_id <- "1LcX-JnpGB0lU1iDnXnxB6WFqBywUKpew" # google file ID
poliblogs<-read.csv(sprintf("https://docs.google.com/uc?id=%s&export=download", google_doc_id), stringsAsFactors = FALSE)

Pre-Process

library(stm)
processed <- textProcessor(poliblogs$documents, metadata = poliblogs)

Pre-Process

out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <-out$meta

Running a Structural Topic Model

First_STM <- stm(documents = out$documents, vocab = out$vocab,
              K = 10, prevalence =~ rating + s(day) ,
              max.em.its = 75, data = out$meta,
              init.type = "Spectral", verbose = FALSE)

Plot Top Words

plot(First_STM)

Plot

Find Exemplary Passages

findThoughts(First_STM, texts = poliblogs$documents,
     n = 2, topics = 3)

Choosing k

findingk <- searchK(out$documents, out$vocab, K = c(10, 30),
 prevalence =~ rating + s(day), data = meta, verbose=FALSE)

plot(findingk)

Working with meta-data

Working with meta-data

predict_topics<-estimateEffect(formula = 1:10 ~ rating + s(day), stmobj = First_STM, metadata = out$meta, uncertainty = "Global")

Plot

plot(predict_topics, covariate = "rating", topics = c(3, 5, 9),
 model = First_STM, method = "difference",
 cov.value1 = "Liberal", cov.value2 = "Conservative",
 xlab = "More Conservative ... More Liberal",
 main = "Effect of Liberal vs. Conservative",
 xlim = c(-.1, .1), labeltype = "custom",
 custom.labels = c('Topic 3', 'Topic 5','Topic 9'))

Plot

Plot Topic Prevalence over TIme

plot(predict_topics, "day", method = "continuous", topics = 3,
model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)")
monthseq <- seq(from = as.Date("2008-01-01"),
to = as.Date("2008-12-01"), by = "month")
monthnames <- months(monthseq)
axis(1,at = as.numeric(monthseq) - min(as.numeric(monthseq)),
labels = monthnames)

Plot

Limitations of topic models