# load libraries
library(quanteda)            # corpus construction, tokens(), dfm()
library(quanteda.textstats)  # textstat_frequency() (split out of quanteda in v3)
library(quanteda.textplots)  # textplot_wordcloud() (split out of quanteda in v3)
library(readtext)            # read .txt and .pdf files
library(stringi)             # Unicode normalization (stri_trans_nfkc)
library(plyr)
library(RColorBrewer)
library(viridis)             # color palette for the word cloud


#Make sure the working directory is set to the folder containing the input files for each step below
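#for example (the path below is purely illustrative; adjust to your own layout):
#setwd("~/jargon-analysis/data")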

########################
#Option 1: create your own corpora
########################

############
#create a list of stop words, to be used in addition to the SMART stopword list
# http://wordlist.aspell.net/12dicts/
TOFCore <- read.delim("2of5core.txt",header = FALSE)
TOFCore <- as.character(TOFCore$V1)
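#optional sanity check (nothing below depends on it): confirm the stop word
#list loaded as a character vector
length(TOFCore)
head(TOFCore)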
############


############
#create the scientific corpora and document-feature matrices (dfm)
#load the scientific corpus, available here:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5694964/
############

tempsci_H <- readtext("H*.txt", encoding = "UTF-8")
tempsci_L <- readtext("L*.txt", encoding = "UTF-8")
tempsci_S <- readtext("S*.txt", encoding = "UTF-8")
tempsci_P <- readtext("P*.txt", encoding = "UTF-8")

#normalize to Unicode NFKC form; apply to the text column so the readtext
#object (doc_id + text) stays intact
tempsci_H$text <- stri_trans_nfkc(tempsci_H$text)
tempsci_L$text <- stri_trans_nfkc(tempsci_L$text)
tempsci_S$text <- stri_trans_nfkc(tempsci_S$text)
tempsci_P$text <- stri_trans_nfkc(tempsci_P$text)

############
# create the scientific corpus, note that it is created from subsets based on subject.
############

mycorpussci_H <- corpus(tempsci_H)
mycorpussci_L <- corpus(tempsci_L)
mycorpussci_P <- corpus(tempsci_P)
mycorpussci_S <- corpus(tempsci_S)
mycorpussci_E <- mycorpussci_H + mycorpussci_L + mycorpussci_P + mycorpussci_S
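#optional check: the combined corpus should contain one document per file read
ndoc(mycorpussci_E)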

############
# create a dfm
#remove stopwords, punctuation, and numbers; split hyphenated words
#set the required variables for the sci_E data frame
############
#quanteda v3 idiom: tokenize first, remove stopwords, then build the dfm
#("cid" is removed because it appears as debris in text extracted from PDFs)
tokssci_E <- tokens(mycorpussci_E, remove_punct = TRUE, remove_numbers = TRUE,
                    remove_symbols = FALSE, split_hyphens = TRUE)
tokssci_E <- tokens_remove(tokssci_E, c(TOFCore, stopwords(source = "smart"), "cid"))
mydfmsci_E <- dfm(tokssci_E)

sci_E <- textstat_frequency(mydfmsci_E)
sci_total_E <- sum(sci_E$frequency)
f_sci_E <- sci_E$frequency

############
#load the American English corpus
# http://www.anc.org/
############

tempTran_eng <- readtext("*.txt")
encodingType_eng <- encoding(tempTran_eng)$all

tempeng <- readtext("*.txt", encoding = encodingType_eng)
tempeng$text <- stri_trans_nfkc(tempeng$text)
mycorpuseng <- corpus(tempeng)

############
# create a dfm
#remove stopwords, punctuation, and numbers; split hyphenated words
#set the required variables for the eng data frame
############
tokseng <- tokens(mycorpuseng, remove_punct = TRUE, remove_numbers = TRUE,
                  remove_symbols = FALSE, split_hyphens = TRUE)
tokseng <- tokens_remove(tokseng, c(TOFCore, stopwords(source = "smart")))
mydfmeng <- dfm(tokseng)

eng <- textstat_frequency(mydfmeng)
eng_total <- sum(eng$frequency)
f_eng <- eng$frequency
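
############
#optional: save the frequency tables so they can be reloaded with Option 2
#below (the file names match the readRDS() calls there)
############
saveRDS(eng, "eng.rds")
saveRDS(sci_E, "sci_E.rds")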

########################
#END Option 1
########################


########################
#Option 2: read in pre-made corpora
########################

eng <- readRDS("eng.rds")
eng_total <- sum(eng$frequency)
f_eng <- eng$frequency
sci_E <- readRDS("sci_E.rds")
sci_total_E <- sum(sci_E$frequency)
f_sci_E <- sci_E$frequency

########################
#END Option 2
########################


############
#read in transcript(s), create a corpus, create a dfm
# the columns of the textstat_frequency() output are:
#   feature, frequency, rank, docfreq, group
############

############
#for use if the files are plain-text documents
#encoding is guessed by readtext(). It is recommended that you specify the encoding if it is known, e.g.:
#temp <- readtext("*.txt", encoding = "UTF-8")
#sample file below from Project Gutenberg: The Adventures of Huckleberry Finn
############

tempTran <- readtext("*.txt")
encodingType <- encoding(tempTran)$all
filenames <- dir(pattern = "\\.txt$")
temp <- readtext("*.txt", encoding = encodingType)
temp$text <- stri_trans_nfkc(temp$text)

############
#Use the script above for .txt files, and the one below for PDF files.
############


tempTran <- readtext("*.pdf")
encodingType <- encoding(tempTran)$all
filenames <- dir(pattern = "\\.pdf$")
temp <- readtext("*.pdf", encoding = encodingType)
temp$text <- stri_trans_nfkc(temp$text)


############
#Create the corpus from the transcript read above
############

mycorpus <- corpus(temp)
docvars(mycorpus, "Name") <- filenames
#note: stopwords are not removed here; they get j = 0 in the calculation
#below because they are absent from both reference corpora
toks <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = FALSE, split_hyphens = TRUE)
mydfm <- dfm(toks)

transcript <- docnames(mydfm)
trans <- textstat_frequency(mydfm, groups = transcript)
trans_total <- sum(trans$frequency)


############
#calculate j for every word in the transcript and add the values to the
#trans data frame:
#  a = the word's relative frequency in the scientific corpus
#  b = the word's relative frequency in general English
#  j = log10(a/b) when the word is more frequent in science (a > b > 0),
#      3 when the word never occurs in general English (a > 0, b = 0),
#      0 otherwise
############
trans$a <- f_sci_E[match(trans$feature, sci_E$feature)] / sci_total_E
trans$b <- f_eng[match(trans$feature, eng$feature)] / eng_total

trans$a[is.na(trans$a)] <- 0
trans$b[is.na(trans$b)] <- 0

trans$j <- ifelse((trans$a > trans$b) & (trans$b > 0), log10(trans$a / trans$b),
                  ifelse((trans$a > 0) & (trans$b == 0), 3, 0))
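#worked example: a word with relative frequency a = 1e-4 in the scientific
#corpus and b = 1e-6 in general English gets j = log10(1e-4 / 1e-6) = 2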


#weight each word's jargon score by how often it occurs in the transcript
trans$j_norm <- trans$j * trans$frequency

############
#the following computes the overall jargon score for a single input document
############

sum(trans$j_norm)/trans_total
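
#the same value, stored for reuse (the variable name is illustrative):
jargon_score <- sum(trans$j_norm) / trans_total
print(jargon_score)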

############
#if there are multiple input documents, the following
#will calculate the average jargon score per document
############


tapply(trans$j_norm,trans$group,sum)/tapply(trans$frequency,trans$group,sum)
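
#the same per-document averages, stored and sorted with the most jargon-heavy
#document first (the variable name is illustrative):
jargon_by_doc <- tapply(trans$j_norm, trans$group, sum) /
  tapply(trans$frequency, trans$group, sum)
sort(jargon_by_doc, decreasing = TRUE)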


############
#information about which words are likely jargon is printed below
############

jargon_words <- subset(trans, j == 3)
print(paste("Total number of words:", trans_total))
print("Words with a j value of 3 include:")
jargon_words[, c("group", "feature")]
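
#optionally export the likely-jargon list for manual review (the file name
#is illustrative):
write.csv(jargon_words[, c("group", "feature", "frequency", "j")],
          "jargon_words.csv", row.names = FALSE)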

############
#a word cloud can be created below.
#two items of interest to control:
#1. how many words to include in the cloud, set by max_words (currently 100)
#2. which words to remove from the cloud. currently stop words (TOFCore and
#   the SMART list) are removed; "br" and "arra" are removed as extras.
############

toks_wc <- tokens(mycorpus, remove_punct = TRUE, remove_numbers = TRUE,
                  remove_symbols = FALSE, split_hyphens = TRUE)
toks_wc <- tokens_remove(toks_wc, c(TOFCore, stopwords(source = "smart"), "br", "arra"))
toks_wc <- tokens_wordstem(toks_wc)   # stem the words for the cloud
mydfm_wc <- dfm(toks_wc)

textplot_wordcloud(mydfm_wc, max_words = 100, color = viridis(8))
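
#to write the cloud to a file instead of the active graphics device (the
#file name is illustrative):
#png("wordcloud.png", width = 800, height = 800)
#textplot_wordcloud(mydfm_wc, max_words = 100, color = viridis(8))
#dev.off()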