I had never tried text mining with R before. I came across this great little tutorial and thought I would use it to do some basic text mining on my PhD thesis, which I first saved as a .txt file.
setwd("C:/Users/Dan/Documents/GitHub/text_mining_my_PHD_thesis")
## Install ##
# install.packages("tm") # for text mining
# install.packages("SnowballC") # for text stemming
# install.packages("wordcloud") # word-cloud generator
# install.packages("RColorBrewer") # color palettes
## Load ##
library("tm")
## Loading required package: NLP
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
# Read the text file
text <- readLines("my_PhD_thesis.txt")
# Load the data as a corpus
docs <- Corpus(VectorSource(text))
# Text transformation
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, "oâ???T")
docs <- tm_map(docs, toSpace, "\\^oâ")
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop words
# specify your stop words as a character vector
docs <- tm_map(docs, removeWords, c("timetask", "http", "doi", "org", "fig",
                                    "non", "nes", "via", "pardo"))
# Eliminate extra whitespace
docs <- tm_map(docs, stripWhitespace)
# Text stemming (left commented out here so words keep their full, unstemmed form)
# docs <- tm_map(docs, stemDocument)
# inspect(docs)
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 25)
## word freq
## attention attention 536
## spatial spatial 474
## time time 362
## task task 357
## bias bias 341
## right right 321
## left left 296
## asymmetry asymmetry 271
## target target 259
## participants participants 247
## light light 228
## hemisphere hemisphere 220
## effect effect 171
## visual visual 162
## dat dat 161
## visuospatial visuospatial 136
## alertness alertness 130
## data data 129
## initial initial 109
## load load 109
## pre pre 108
## brain brain 107
## power power 107
## shift shift 105
## dopamine dopamine 103
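Before the word cloud, a quick bar chart gives a simpler view of the same frequency table; a minimal sketch in base R (the colour and title are my own choices):
# Bar chart of the 10 most frequent words
barplot(d[1:10, ]$freq, names.arg = d[1:10, ]$word, las = 2,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequency")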
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
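To save the cloud rather than just draw it on screen, the same call can be wrapped in a graphics device; a minimal sketch using base R's png() (the filename is hypothetical):
# Write the word cloud to a PNG file instead of the plot window
png("thesis_wordcloud.png", width = 800, height = 800)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
dev.off()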
We can also find which terms are most correlated with a given word, e.g. words associated with “attention”:
findAssocs(dtm, terms = "attention", corlimit = 0.3)
## $attention
## spatial right alertness
## 0.71 0.61 0.60
## hemisphere evidence left
## 0.53 0.49 0.48
## network networks healthy
## 0.47 0.47 0.45
## ventral dorsal hemispheric
## 0.45 0.44 0.44
## specifically visuospatial lateralised
## 0.43 0.43 0.42
## orienting patients inferior
## 0.42 0.42 0.41
## sustained understanding caused
## 0.41 0.41 0.40
## emergent harder ifg
## 0.40 0.40 0.40
## ipl marlies may
## 0.40 0.40 0.40
## poorly vulnerable engage
## 0.40 0.40 0.39
## modulate rightward studies
## 0.39 0.39 0.39
## asked corbetta activation
## 0.38 0.38 0.37
## example inattention show
## 0.37 0.37 0.37
## shulman trait whilst
## 0.37 0.37 0.37
## asymmetrically disorders gyrus
## 0.36 0.36 0.36
## space found influence
## 0.36 0.35 0.35
## also blue parietal
## 0.34 0.34 0.34
## regions taxing adler
## 0.34 0.34 0.33
## brouwer elicited fasotti
## 0.33 0.33 0.33
## foucher geurts johannsen
## 0.33 0.33 0.33
## kessel progressed although
## 0.33 0.33 0.32
## auditory behavioural chapter
## 0.32 0.32 0.32
## enriched factors paus
## 0.32 0.32 0.32
## results suggests addition
## 0.32 0.32 0.31
## exposure ingram leftward
## 0.31 0.31 0.31
## neurophysiological peers poynter
## 0.31 0.31 0.31
## proven activity cued
## 0.31 0.30 0.30
## decrease decreasing driving
## 0.30 0.30 0.30
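Related to findAssocs(), tm's findFreqTerms() lists every term above a frequency threshold; a minimal sketch (the cutoff of 100 is arbitrary):
# List all terms that appear at least 100 times
findFreqTerms(dtm, lowfreq = 100)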