I had never tried text mining with R before. I came across this great little tutorial and thought I would use it to do some basic text mining on my PhD thesis, which I first saved as a .txt file.

Set up

# Folder that holds the thesis text file on the author's machine.
# Only switch into it when it actually exists: a bare setwd() on a missing
# path stops the whole script, which makes it unusable on any other computer
# even when it is started from inside the project folder.
project_dir <- "C://Users//Dan/Documents//GitHub//text_mining_my_PHD_thesis"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project folder not found; staying in ", getwd())
}


## Install ##
 # install.packages("tm")  # for text mining
# install.packages("SnowballC") # for text stemming
# install.packages("wordcloud") # word-cloud generator 
# install.packages("RColorBrewer") # color palettes


# Load
library("tm")
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")

Clean the text

# Import the thesis; readLines() returns one character string per line
text <- readLines("my_PhD_thesis.txt")
# Wrap those lines in a tm corpus built from the character vector
docs <- Corpus(VectorSource(text))

# Helper transformation: swap whatever matches `pattern` for a single space
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern, " ", x)
  }
)

# Patterns to blank out, applied one after another in this order.
# NOTE(review): the last two look like mis-encoded characters, presumably
# left over from reading the file in a different encoding than it was saved
# in -- confirm, and consider passing `encoding =` to readLines() instead.
space_patterns <- c("/", "@", "\\|", "-", "oâ???T", "\\^oâ")
for (space_pattern in space_patterns) {
  docs <- tm_map(docs, toSpace, space_pattern)
}


# Strip punctuation marks
docs <- tm_map(docs, removePunctuation)
# Strip digits
docs <- tm_map(docs, removeNumbers)
# Fold everything to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Drop the common English stopwords
english_stopwords <- stopwords("english")
docs <- tm_map(docs, removeWords, english_stopwords)
# Drop additional unwanted words, given as a character vector
thesis_stopwords <- c(
  "timetask", "http", "doi", "org", "fig",
  "non", "nes", "via", "pardo"
)
docs <- tm_map(docs, removeWords, thesis_stopwords)
# Collapse runs of white space
docs <- tm_map(docs, stripWhitespace)
# Stemming is available but deliberately left switched off
# docs <- tm_map(docs, stemDocument)

# inspect(docs)

Build a term-document matrix

# Count how often each term occurs in each document (terms are the rows)
dtm <- TermDocumentMatrix(docs)
# Dense copy of the counts so that rowSums() can be applied
m <- as.matrix(dtm)
# Total occurrences per term across all documents, most frequent first
term_totals <- rowSums(m)
v <- term_totals[order(term_totals, decreasing = TRUE)]
# Two-column frequency table: the term and its total count
d <- data.frame(word = names(v), freq = v)
# Show the 25 most frequent terms
head(d, n = 25)
##                      word freq
## attention       attention  536
## spatial           spatial  474
## time                 time  362
## task                 task  357
## bias                 bias  341
## right               right  321
## left                 left  296
## asymmetry       asymmetry  271
## target             target  259
## participants participants  247
## light               light  228
## hemisphere     hemisphere  220
## effect             effect  171
## visual             visual  162
## dat                   dat  161
## visuospatial visuospatial  136
## alertness       alertness  130
## data                 data  129
## initial           initial  109
## load                 load  109
## pre                   pre  108
## brain               brain  107
## power               power  107
## shift               shift  105
## dopamine         dopamine  103

Generate the Word cloud from my PhD Thesis

# Eight-colour "Dark2" palette from RColorBrewer for the cloud
cloud_colours <- brewer.pal(8, "Dark2")

# Fix the random seed so the layout is the same on every run
set.seed(1234)
# Draw at most 200 words, most frequent first, with 35% of them rotated
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = cloud_colours
)

Explore associations

For example, here are the words associated with “attention”:

# Terms whose counts correlate with those of "attention" at 0.3 or above
findAssocs(x = dtm, terms = "attention", corlimit = 0.3)
## $attention
##            spatial              right          alertness 
##               0.71               0.61               0.60 
##         hemisphere           evidence               left 
##               0.53               0.49               0.48 
##            network           networks            healthy 
##               0.47               0.47               0.45 
##            ventral             dorsal        hemispheric 
##               0.45               0.44               0.44 
##       specifically       visuospatial        lateralised 
##               0.43               0.43               0.42 
##          orienting           patients           inferior 
##               0.42               0.42               0.41 
##          sustained      understanding             caused 
##               0.41               0.41               0.40 
##           emergent             harder                ifg 
##               0.40               0.40               0.40 
##                ipl            marlies                may 
##               0.40               0.40               0.40 
##             poorly         vulnerable             engage 
##               0.40               0.40               0.39 
##           modulate          rightward            studies 
##               0.39               0.39               0.39 
##              asked           corbetta         activation 
##               0.38               0.38               0.37 
##            example        inattention               show 
##               0.37               0.37               0.37 
##            shulman              trait             whilst 
##               0.37               0.37               0.37 
##     asymmetrically          disorders              gyrus 
##               0.36               0.36               0.36 
##              space              found          influence 
##               0.36               0.35               0.35 
##               also               blue           parietal 
##               0.34               0.34               0.34 
##            regions             taxing              adler 
##               0.34               0.34               0.33 
##            brouwer           elicited            fasotti 
##               0.33               0.33               0.33 
##            foucher             geurts          johannsen 
##               0.33               0.33               0.33 
##             kessel         progressed           although 
##               0.33               0.33               0.32 
##           auditory        behavioural            chapter 
##               0.32               0.32               0.32 
##           enriched            factors               paus 
##               0.32               0.32               0.32 
##            results           suggests           addition 
##               0.32               0.32               0.31 
##           exposure             ingram           leftward 
##               0.31               0.31               0.31 
## neurophysiological              peers            poynter 
##               0.31               0.31               0.31 
##             proven           activity               cued 
##               0.31               0.30               0.30 
##           decrease         decreasing            driving 
##               0.30               0.30               0.30