
Creating a Corpus of Text Documents

#Creating Corpus of Text Documents
cname <- file.path(".", "TxtData")
cname
length(dir(cname))
dir(cname)
library(tm)
docs <- Corpus(DirSource(cname))
docs
class(docs)
summary(docs)
#Exploring the Corpus
library(magrittr)
inspect(docs[16])
viewDocs <- function(d, n) {
  d %>% extract2(n) %>% as.character() %>% writeLines()
}
viewDocs(docs,16)
# Simple transformations: for example, we might want to replace "/", which is
# sometimes used to separate alternative words, with a space. This avoids the two
# words being run into one string of characters by the later transformations.
# We might also replace "@" and "|" with a space, for the same reason.
toSpace <- content_transformer(function(x,pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs <- tm_map(docs, toSpace, "\\\\")   # "\\\\" matches a literal backslash
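# Optional extra clean-up: a similar transformer could strip simple http/https URLs;
# the regular expression below is only an assumption about the URL patterns that may
# appear in the documents.
# removeURL <- content_transformer(function(x) gsub("https?://\\S+", " ", x))
# docs <- tm_map(docs, removeURL)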
inspect(docs[1])
#conversion to lower case
docs <- tm_map(docs, content_transformer(tolower))
inspect(docs[1])
#Removing Numbers from the document
docs <- tm_map(docs, removeNumbers)
inspect(docs[1])
#Remove Punctuation
docs <- tm_map(docs, removePunctuation)
inspect(docs[1])
#Removing Stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
inspect(docs[1])
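# For reference, the built-in English stop word list that was just removed can be
# inspected directly:
length(stopwords("english"))
head(stopwords("english"))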
# Removing our own (domain-specific) stop words
docs <- tm_map(docs, removeWords, c("can", "figure", "will", "equation", "use",
                                    "publications", "inc", "following"))
docs <- tm_map(docs, removeWords, c("two", "however", "thus", "using", "shows",
                                    "may", "example", "since", "let", "shown"))
docs <- tm_map(docs, removeWords, c("see", "doi", "used", "given", "three",
                                    "possible", "paper", "also", "based", "section"))
docs <- tm_map(docs, removeWords, c("table", "now", "result", "every", "even",
                                    "obtained", "found", "four", "eca", "type"))
docs <- tm_map(docs, removeWords, c("study", "consider", "show", "way", "left",
                                    "section", "journal", "note", "means"))
docs <- tm_map(docs, removeWords, c("second", "follows", "next", "corresponding",
                                    "particular", "one", "first", "therefore"))
docs <- tm_map(docs, removeWords, c("another", "called", "described", "well",
                                    "fact", "must", "like", "similar", "hence"))
docs <- tm_map(docs, removeWords, c("within", "due", "university", "refer", "right",
                                    "single", "results", "introduction", "references"))
# Stripping extra whitespace
docs <- tm_map(docs, stripWhitespace)
# Stemming the documents:
# Stemming uses an algorithm that removes common word endings from English words,
# such as "es", "ed" and "s". The stemmer is provided by the SnowballC package.
library(SnowballC)
docs <- tm_map(docs, stemDocument)
inspect(docs[1])
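# To illustrate what stemDocument() does, applying it to a plain character vector
# reduces each word to its stem:
stemDocument(c("computes", "computed", "computing"))   # all three become "comput"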
#Creating Document Term Matrix
dtm <- DocumentTermMatrix(docs)
dtm
class(dtm)
dim(dtm)
#Obtaining Term Frequencies
freq <- colSums(as.matrix(dtm))
length(freq)
# Ordering frequencies to list the most and least frequent terms
ord <- order(freq)
# listing the least frequent terms
freq[head(ord)]
# listing the most frequent terms
freq[tail(ord)]
# Removing sparse terms from the corpus/documents
dim(dtm)
dtms <- removeSparseTerms(dtm, 0.2)
dim(dtms)
inspect(dtms)
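# The second argument of removeSparseTerms() is the maximum allowed sparsity:
# 0.2 keeps only terms present in at least 80% of the documents. A looser threshold
# retains more terms, for comparison:
dim(removeSparseTerms(dtm, 0.8))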
# Converting the document-term matrix to a simple matrix and saving it as a CSV
#m <- as.matrix(dtm)
#write.csv(m,file="dtm.csv")
# Identifying frequent terms and associations
findFreqTerms(dtm, lowfreq = 500)
findAssocs(dtm, "complex", corlimit=.7)
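# findAssocs() also accepts several terms at once; the extra term "system" here is
# only an example and assumes it actually occurs in the corpus.
# findAssocs(dtm, c("complex", "system"), corlimit = 0.7)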
# Plotting word frequency
freq<-sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 14)
# Alternative (commented out): plot terms with frequency above 700
# wf <- data.frame(word = names(freq), freq = freq)
# head(wf)
# library(ggplot2)
# ggplot(subset(wf, freq > 700), aes(word, freq)) +
#   geom_bar(stat = "identity") +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))
wf <- data.frame(term = names(freq), occurrences = freq)
library(ggplot2)
p <- ggplot(subset(wf, occurrences > 1000), aes(term, occurrences))
p <- p + geom_bar(stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
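# If needed, the frequency plot can be written to disk (the file name is arbitrary):
# ggsave("term_frequency.png", p, width = 8, height = 5)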
#WordCloud
library(wordcloud)
set.seed(123)
#wordcloud(names(freq), freq, min.freq=500)
#wordcloud(names(freq), freq, min.freq=500, colors=brewer.pal(6, "Dark2"))
wordcloud(names(freq), freq, min.freq=1000, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
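# A variant capping the number of words drawn instead of using a frequency cut-off
# (the limit of 100 words is an arbitrary choice):
# wordcloud(names(freq), freq, max.words=100, random.order=FALSE,
#           colors=brewer.pal(6, "Dark2"))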
#Word Clustering
#Method 1
distMatrix <- dist(scale(as.matrix(dtms)))
dtms.fit <- hclust(distMatrix, method = "ward.D2")
plot(dtms.fit)
#plot(dtms.fit, cex=0.9, hang=-1, main="Word Cluster Dendrogram")
#rect.hclust(dtms.fit, k=5)
#Method 2
library(cluster)
d <- dist(t(as.matrix(dtms)), method = "euclidean")
fit <- hclust(d = d, method = "ward.D2")   # for a different look try method="ward.D" or "complete"
fit
plot(fit, hang=-1)
# Cutting the dendrogram into k clusters
plot.new()
plot(fit, hang = -1)
groups <- cutree(fit, k = 4)
rect.hclust(fit, k = 4, border = "red")
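# Quick check of the cut: how many terms fall into each of the four clusters,
# and which terms they are.
table(groups)
split(names(groups), groups)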
# K-means clustering
library(fpc)
d <- dist(t(as.matrix(dtms)), method = "euclidean")
kfit <- kmeans(d, 5)
clusplot(as.matrix(d), kfit$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)
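# A rough way to judge the choice of k=5: compute the total within-cluster sum of
# squares over a range of k values and look for an "elbow" (the range 2:10 is arbitrary).
# wss <- sapply(2:10, function(k) kmeans(d, k)$tot.withinss)
# plot(2:10, wss, type="b", xlab="number of clusters k", ylab="total within-cluster SS")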
# Plotting term correlations requires the Rgraphviz package (from Bioconductor).
# On older R versions it was installed with biocLite(), as below; on R >= 3.5 use
# BiocManager::install("Rgraphviz") instead.
source("https://bioconductor.org/biocLite.R")
biocLite("Rgraphviz")
plot(dtm,
     terms = findFreqTerms(dtm, lowfreq = 800),
     corThreshold = 0.3)
library(tm)
tdm <- TermDocumentMatrix(docs, control = list(wordLengths = c(1, Inf)))
tdm
dim(tdm)
#create matrix
tdm.m <- as.matrix(tdm)
#to boolean matrix
tdm.m[tdm.m>=1] <- 1
# to term adjacency matrix
# %*% is product of 2 matrices
tdm.m2 <- tdm.m %*% t(tdm.m)
# build graph with igraph ####
library(igraph)
# build adjacency graph
tdm.g <- graph.adjacency(tdm.m2, weighted=TRUE, mode="undirected")
# remove loops
tdm.g <- simplify(tdm.g)
# set labels and degrees of vertices
V(tdm.g)$label <- V(tdm.g)$name
V(tdm.g)$degree <- degree(tdm.g)
# plot layout fruchterman.reingold
layout1 <- layout.fruchterman.reingold(tdm.g)
plot(tdm.g, layout=layout1, vertex.size=20,
vertex.label.color="darkred")
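# The most highly connected terms (largest degree) can be listed directly from the graph:
head(sort(degree(tdm.g), decreasing = TRUE), 10)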