#installpackage
# data wrangling
install.packages("dplyr")
install.packages("tidyr")
install.packages("lubridate")
# visualization
install.packages("ggplot2")
# dealing with text
install.packages("textclean")
install.packages("tm")
install.packages("SnowballC")
install.packages("stringr")
# topic model
install.packages("tidytext")
install.packages("topicmodels")
install.packages("textmineR")

#activatepackage
# data wrangling
library(dplyr)
library(tidyr)
library(lubridate)
# visualization
library(ggplot2)
# dealing with text
library(textclean)
library(tm)
library(SnowballC)
library(stringr)
# topic model
library(tidytext)
library(topicmodels)
library(textmineR)

data <- read.csv("D:/TOPIC MODELING R STUDIO/predictedresults.csv")

data <- data %>%
mutate(overall = as.factor(overall))
select(reviewText, overall,reviewTime)
head(data)

# build textcleaner function
textcleaner <- function(x){
  x <- as.character(x)
  
  x <- x %>%
    str_to_lower() %>%  # convert all the string to low alphabet
    replace_contraction() %>% # replace contraction to their multi-word forms
    replace_internet_slang() %>% # replace internet slang to normal words
    replace_emoji() %>% # replace emoji to words
    replace_emoticon() %>% # replace emoticon to words
    replace_hash(replacement = "") %>% # remove hashtag
    replace_word_elongation() %>% # replace informal writing with known semantic replacements
    replace_number(remove = T) %>% # remove number
    replace_date(replacement = "") %>% # remove date
    replace_time(replacement = "") %>% # remove time
    str_remove_all(pattern = "[[:punct:]]") %>% # remove punctuation
    str_remove_all(pattern = "[^\\s]*[0-9][^\\s]*") %>% # remove mixed string n number
    str_squish() %>% # reduces repeated whitespace inside a string.
    str_trim() # removes whitespace from start and end of string
  
  xdtm <- VCorpus(VectorSource(x)) %>%
    tm_map(removeWords, c(stopwords("english"),"i","posted","poste","sticking", "tongue", "yourselves","yourself","yours","your","your","you","you","would","wkwkwkwkwkwkwkwkwk","wkwkwkwkwkwk","wkwkwkwkwk","wkwkwkwk","wkwkwk","wkwk","wkkwkwkwkwkwkwk","with","with","will","will","why","whom","who","who","while","while","which","which","where","where","when","when","what","what","were","well","we","way","was","was","want","very","very","up","up","until","under","too","too","to","to","through","through","those","this","this","they","they","these","there","there","then","then","themselves","them","them","theirs","their","their","the","the","that","that","than","than","t","t","such","still","some","some","so","so","should","she","september","same","s","s","review","recommend","really","re","pros","pro","posted","own","over","out","out","ourselves","ours","our","other","other","or","or","only","only","one","once","on","on","off","of","of","now","november","november","not","not","nor","no","no","myself","my","my","much","most","most","more","more","me","me","may","may","march","march","make","m","lot","like","just","just","june","july","january","itself","its","its","it","it","is","is","into","into","in","in","if","if","i","how","how","his","himself","him","herself","hers","here","her","he","having","have","have","has","has","hahahahahahahahaha","hahahahahahahaha","hahahahahahaha","hahahahahaha","hahahahaha","hahahaha","hahaha","haha","had","had","get","games","game","further","from","from","for","for","first","few","february","february","even","each","during","down","don","don","doing","does","do","do","did","december","cons","con","can","can","by","by","but","but","both","between","below","being","before","been","because","because","be","be","august","at","at","as","as","are","are","april","any","and","and","an","an","am","also","all","all","against","again","after","after","above","about","about","a")) 
  
  # convert corpus to document term matrix
  return(DocumentTermMatrix(xdtm))
  
}

data_1 <- data %>% filter(overall == 1)
data_0 <- data %>% filter(overall == 0)

table(data$overall)

#positive sentiment
# apply textcleaner function for review text
dtm_1 <- textcleaner(data_1$reviewText)
# find most frequent terms. i choose words that at least appear in 50 reviews
freqterm_1 <- findFreqTerms(dtm_1,50)
# we have 981 words. subset the dtm to only choose those selected words
dtm_1 <- dtm_1[,freqterm_1]
# only choose words that appear once in each rows
rownum_1 <- apply(dtm_1,1,sum)
dtm_1 <- dtm_1[rownum_1>0,]
# apply to LDA function. set the k = 6, means we want to build 6 topic 
lda_1 <- LDA(dtm_1,k = 6,control = list(seed = 1502))
# apply auto tidy using tidy and use beta as per-topic-per-word probabilities
topic_1 <- tidy(lda_1,matrix = "beta")

# choose 15 words with highest beta from each topic
top_terms_1 <- topic_1 %>%
  group_by(topic) %>%
  top_n(15,beta) %>% 
  ungroup() %>%
  arrange(topic,-beta)
# plot the topic and words for easy interpretation
plot_topic_1 <- top_terms_1 %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

plot_topic_1

#negative sentiment
# apply textcleaner function for review text
dtm_0 <- textcleaner(data_0$reviewText)
# find most frequent terms. i choose words that at least appear in 50 reviews
freqterm_0 <- findFreqTerms(dtm_0,50)
# we have 981 words. subset the dtm to only choose those selected words
dtm_0 <- dtm_0[,freqterm_0]
# only choose words that appear once in each rows
rownum_0 <- apply(dtm_0,1,sum)
dtm_0 <- dtm_0[rownum_0>0,]
# apply to LDA function. set the k = 6, means we want to build 6 topic 
lda_0 <- LDA(dtm_0,k = 6,control = list(seed = 1502))
# apply auto tidy using tidy and use beta as per-topic-per-word probabilities
topic_0 <- tidy(lda_0,matrix = "beta")

# choose 15 words with highest beta from each topic
top_terms_0 <- topic_0 %>%
  group_by(topic) %>%
  top_n(15,beta) %>% 
  ungroup() %>%
  arrange(topic,-beta)
# plot the topic and words for easy interpretation
plot_topic_0 <- top_terms_0 %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

plot_topic_0