#installpackage # data wrangling install.packages("dplyr") install.packages("tidyr") install.packages("lubridate") # visualization install.packages("ggplot2") # dealing with text install.packages("textclean") install.packages("tm") install.packages("SnowballC") install.packages("stringr") # topic model install.packages("tidytext") install.packages("topicmodels") install.packages("textmineR") #activatepackage # data wrangling library(dplyr) library(tidyr) library(lubridate) # visualization library(ggplot2) # dealing with text library(textclean) library(tm) library(SnowballC) library(stringr) # topic model library(tidytext) library(topicmodels) library(textmineR) data <- read.csv("D:/TOPIC MODELING R STUDIO/predictedresults.csv") data <- data %>% mutate(overall = as.factor(overall)) select(reviewText, overall,reviewTime) head(data) # build textcleaner function textcleaner <- function(x){ x <- as.character(x) x <- x %>% str_to_lower() %>% # convert all the string to low alphabet replace_contraction() %>% # replace contraction to their multi-word forms replace_internet_slang() %>% # replace internet slang to normal words replace_emoji() %>% # replace emoji to words replace_emoticon() %>% # replace emoticon to words replace_hash(replacement = "") %>% # remove hashtag replace_word_elongation() %>% # replace informal writing with known semantic replacements replace_number(remove = T) %>% # remove number replace_date(replacement = "") %>% # remove date replace_time(replacement = "") %>% # remove time str_remove_all(pattern = "[[:punct:]]") %>% # remove punctuation str_remove_all(pattern = "[^\\s]*[0-9][^\\s]*") %>% # remove mixed string n number str_squish() %>% # reduces repeated whitespace inside a string. str_trim() # removes whitespace from start and end of string xdtm <- VCorpus(VectorSource(x)) %>% tm_map(removeWords, c(stopwords("english"),"i","posted","poste","sticking", "tongue", "yourselves","yourself","yours","your","your","you","you","would","wkwkwkwkwkwkwkwkwk","wkwkwkwkwkwk","wkwkwkwkwk","wkwkwkwk","wkwkwk","wkwk","wkkwkwkwkwkwkwk","with","with","will","will","why","whom","who","who","while","while","which","which","where","where","when","when","what","what","were","well","we","way","was","was","want","very","very","up","up","until","under","too","too","to","to","through","through","those","this","this","they","they","these","there","there","then","then","themselves","them","them","theirs","their","their","the","the","that","that","than","than","t","t","such","still","some","some","so","so","should","she","september","same","s","s","review","recommend","really","re","pros","pro","posted","own","over","out","out","ourselves","ours","our","other","other","or","or","only","only","one","once","on","on","off","of","of","now","november","november","not","not","nor","no","no","myself","my","my","much","most","most","more","more","me","me","may","may","march","march","make","m","lot","like","just","just","june","july","january","itself","its","its","it","it","is","is","into","into","in","in","if","if","i","how","how","his","himself","him","herself","hers","here","her","he","having","have","have","has","has","hahahahahahahahaha","hahahahahahahaha","hahahahahahaha","hahahahahaha","hahahahaha","hahahaha","hahaha","haha","had","had","get","games","game","further","from","from","for","for","first","few","february","february","even","each","during","down","don","don","doing","does","do","do","did","december","cons","con","can","can","by","by","but","but","both","between","below","being","before","been","because","because","be","be","august","at","at","as","as","are","are","april","any","and","and","an","an","am","also","all","all","against","again","after","after","above","about","about","a")) # convert corpus to document term matrix return(DocumentTermMatrix(xdtm)) } data_1 <- data %>% filter(overall == 1) data_0 <- data %>% filter(overall == 0) table(data$overall) #positive sentiment # apply textcleaner function for review text dtm_1 <- textcleaner(data_1$reviewText) # find most frequent terms. i choose words that at least appear in 50 reviews freqterm_1 <- findFreqTerms(dtm_1,50) # we have 981 words. subset the dtm to only choose those selected words dtm_1 <- dtm_1[,freqterm_1] # only choose words that appear once in each rows rownum_1 <- apply(dtm_1,1,sum) dtm_1 <- dtm_1[rownum_1>0,] # apply to LDA function. set the k = 6, means we want to build 6 topic lda_1 <- LDA(dtm_1,k = 6,control = list(seed = 1502)) # apply auto tidy using tidy and use beta as per-topic-per-word probabilities topic_1 <- tidy(lda_1,matrix = "beta") # choose 15 words with highest beta from each topic top_terms_1 <- topic_1 %>% group_by(topic) %>% top_n(15,beta) %>% ungroup() %>% arrange(topic,-beta) # plot the topic and words for easy interpretation plot_topic_1 <- top_terms_1 %>% mutate(term = reorder_within(term, beta, topic)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + coord_flip() + scale_x_reordered() plot_topic_1 #negative sentiment # apply textcleaner function for review text dtm_0 <- textcleaner(data_0$reviewText) # find most frequent terms. i choose words that at least appear in 50 reviews freqterm_0 <- findFreqTerms(dtm_0,50) # we have 981 words. subset the dtm to only choose those selected words dtm_0 <- dtm_0[,freqterm_0] # only choose words that appear once in each rows rownum_0 <- apply(dtm_0,1,sum) dtm_0 <- dtm_0[rownum_0>0,] # apply to LDA function. set the k = 6, means we want to build 6 topic lda_0 <- LDA(dtm_0,k = 6,control = list(seed = 1502)) # apply auto tidy using tidy and use beta as per-topic-per-word probabilities topic_0 <- tidy(lda_0,matrix = "beta") # choose 15 words with highest beta from each topic top_terms_0 <- topic_0 %>% group_by(topic) %>% top_n(15,beta) %>% ungroup() %>% arrange(topic,-beta) # plot the topic and words for easy interpretation plot_topic_0 <- top_terms_0 %>% mutate(term = reorder_within(term, beta, topic)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + coord_flip() + scale_x_reordered() plot_topic_0