
Creating a Corpus of Text Documents

#Creating Corpus of Text Documents
cname <- file.path(".", "TxtData")
cname
length(dir(cname))
dir(cname)
library(tm)
docs <- Corpus(DirSource(cname))
docs
class(docs)
summary(docs)
#Exploring the Corpus
library(magrittr)
inspect(docs[16])
viewDocs <- function(d, n) {
  d %>% extract2(n) %>% as.character() %>% writeLines()
}
viewDocs(docs,16)
# Simple transformations: for example, we might want to replace "/", which is
# sometimes used to separate alternative words, with a space. This avoids the two
# words being run into one string of characters by the later transformations.
# We might also replace "@" and "|" with a space, for the same reason.
toSpace <- content_transformer(function(x,pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs <- tm_map(docs, toSpace, "\\\\")   # "\\\\" matches a literal backslash
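# Optional extra clean-up: a similar transformer could strip simple http/https URLs;
# the regular expression below is only an assumption about the URL patterns that may
# appear in the documents.
# removeURL <- content_transformer(function(x) gsub("https?://\\S+", " ", x))
# docs <- tm_map(docs, removeURL)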
inspect(docs[1])
#conversion to lower case
docs <- tm_map(docs, content_transformer(tolower))
inspect(docs[1])
#Removing Numbers from the document
docs <- tm_map(docs, removeNumbers)
inspect(docs[1])
#Remove Punctuation
docs <- tm_map(docs, removePunctuation)
inspect(docs[1])
#Removing Stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
inspect(docs[1])
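# For reference, the built-in English stop word list that was just removed can be
# inspected directly:
length(stopwords("english"))
head(stopwords("english"))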
# Removing our own (domain-specific) stop words
docs <- tm_map(docs, removeWords, c("can", "figure", "will", "equation", "use",
                                    "publications", "inc", "following"))
docs <- tm_map(docs, removeWords, c("two", "however", "thus", "using", "shows",
                                    "may", "example", "since", "let", "shown"))
docs <- tm_map(docs, removeWords, c("see", "doi", "used", "given", "three",
                                    "possible", "paper", "also", "based", "section"))
docs <- tm_map(docs, removeWords, c("table", "now", "result", "every", "even",
                                    "obtained", "found", "four", "eca", "type"))
docs <- tm_map(docs, removeWords, c("study", "consider", "show", "way", "left",
                                    "section", "journal", "note", "means"))
docs <- tm_map(docs, removeWords, c("second", "follows", "next", "corresponding",
                                    "particular", "one", "first", "therefore"))
docs <- tm_map(docs, removeWords, c("another", "called", "described", "well",
                                    "fact", "must", "like", "similar", "hence"))
docs <- tm_map(docs, removeWords, c("within", "due", "university", "refer", "right",
                                    "single", "results", "introduction", "references"))
# Stripping extra whitespace
docs <- tm_map(docs, stripWhitespace)
# Stemming the documents:
# Stemming uses an algorithm that removes common word endings from English words,
# such as "es", "ed" and "s". The stemmer is provided by the SnowballC package.
library(SnowballC)
docs <- tm_map(docs, stemDocument)
inspect(docs[1])
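# To illustrate what stemDocument() does, applying it to a plain character vector
# reduces each word to its stem:
stemDocument(c("computes", "computed", "computing"))   # all three become "comput"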
#Creating Document Term Matrix
dtm <- DocumentTermMatrix(docs)
dtm
class(dtm)
dim(dtm)
#Obtaining Term Frequencies
freq <- colSums(as.matrix(dtm))
length(freq)
# Ordering frequencies to list the most and least frequent terms
ord <- order(freq)
# listing the least frequent terms
freq[head(ord)]
# listing the most frequent terms
freq[tail(ord)]
# Removing sparse terms from the corpus/documents
dim(dtm)
dtms <- removeSparseTerms(dtm, 0.2)
dim(dtms)
inspect(dtms)
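# The second argument of removeSparseTerms() is the maximum allowed sparsity:
# 0.2 keeps only terms present in at least 80% of the documents. A looser threshold
# retains more terms, for comparison:
dim(removeSparseTerms(dtm, 0.8))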
# Converting the document-term matrix to a simple matrix and saving it as a CSV
#m <- as.matrix(dtm)
#write.csv(m,file="dtm.csv")
# Identifying frequent terms and associations
findFreqTerms(dtm, lowfreq = 500)
findAssocs(dtm, "complex", corlimit=.7)
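# findAssocs() also accepts several terms at once; the extra term "system" here is
# only an example and assumes it actually occurs in the corpus.
# findAssocs(dtm, c("complex", "system"), corlimit = 0.7)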
# Plotting word frequency
freq<-sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq, 14)
# Alternative (commented out): plot terms with frequency above 700
# wf <- data.frame(word = names(freq), freq = freq)
# head(wf)
# library(ggplot2)
# ggplot(subset(wf, freq > 700), aes(word, freq)) +
#   geom_bar(stat = "identity") +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))
wf <- data.frame(term = names(freq), occurrences = freq)
library(ggplot2)
p <- ggplot(subset(wf, occurrences > 1000), aes(term, occurrences))
p <- p + geom_bar(stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
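# If needed, the frequency plot can be written to disk (the file name is arbitrary):
# ggsave("term_frequency.png", p, width = 8, height = 5)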
#WordCloud
library(wordcloud)
set.seed(123)
#wordcloud(names(freq), freq, min.freq=500)
#wordcloud(names(freq), freq, min.freq=500, colors=brewer.pal(6, "Dark2"))
wordcloud(names(freq), freq, min.freq=1000, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
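# A variant capping the number of words drawn instead of using a frequency cut-off
# (the limit of 100 words is an arbitrary choice):
# wordcloud(names(freq), freq, max.words=100, random.order=FALSE,
#           colors=brewer.pal(6, "Dark2"))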
#Word Clustering
#Method 1
distMatrix <- dist(scale(as.matrix(dtms)))
dtms.fit <- hclust(distMatrix, method = "ward.D2")
plot(dtms.fit)
#plot(dtms.fit, cex=0.9, hang=-1, main="Word Cluster Dendrogram")
#rect.hclust(dtms.fit, k=5)
#Method 2
library(cluster)
d <- dist(t(as.matrix(dtms)), method = "euclidean")
fit <- hclust(d = d, method = "ward.D2")   # for a different look try method="ward.D" or "complete"
fit
plot(fit, hang=-1)
# Cutting the dendrogram into k clusters
plot.new()
plot(fit, hang = -1)
groups <- cutree(fit, k = 4)
rect.hclust(fit, k = 4, border = "red")
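# Quick check of the cut: how many terms fall into each of the four clusters,
# and which terms they are.
table(groups)
split(names(groups), groups)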
# K-means clustering
library(fpc)
d <- dist(t(as.matrix(dtms)), method = "euclidean")
kfit <- kmeans(d, 5)
clusplot(as.matrix(d), kfit$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)
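# A rough way to judge the choice of k=5: compute the total within-cluster sum of
# squares over a range of k values and look for an "elbow" (the range 2:10 is arbitrary).
# wss <- sapply(2:10, function(k) kmeans(d, k)$tot.withinss)
# plot(2:10, wss, type="b", xlab="number of clusters k", ylab="total within-cluster SS")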
# Plotting term correlations requires the Rgraphviz package (from Bioconductor).
# On older R versions it was installed with biocLite(), as below; on R >= 3.5 use
# BiocManager::install("Rgraphviz") instead.
source("https://bioconductor.org/biocLite.R")
biocLite("Rgraphviz")
plot(dtm,
     terms = findFreqTerms(dtm, lowfreq = 800),
     corThreshold = 0.3)
library(tm)
tdm <- TermDocumentMatrix(docs, control = list(wordLengths = c(1, Inf)))
tdm
dim(tdm)
#create matrix
tdm.m <- as.matrix(tdm)
#to boolean matrix
tdm.m[tdm.m>=1] <- 1
# to term adjacency matrix
# %*% is product of 2 matrices
tdm.m2 <- tdm.m %*% t(tdm.m)
# build graph with igraph ####
library(igraph)
# build adjacency graph
tdm.g <- graph.adjacency(tdm.m2, weighted=TRUE, mode="undirected")
# remove loops
tdm.g <- simplify(tdm.g)
# set labels and degrees of vertices
V(tdm.g)$label <- V(tdm.g)$name
V(tdm.g)$degree <- degree(tdm.g)
# plot layout fruchterman.reingold
layout1 <- layout.fruchterman.reingold(tdm.g)
plot(tdm.g, layout=layout1, vertex.size=20,
vertex.label.color="darkred")
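# The most highly connected terms (largest degree) can be listed directly from the graph:
head(sort(degree(tdm.g), decreasing = TRUE), 10)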