# load libraries
library(quanteda)
library(readtext)
library(stringi)
library(plyr)

#Make sure you choose the correct directory

########################
#Option 1: create your own corpora
########################

############
#create list of stop words, to be used in addition to the SMART stop word list
# http://wordlist.aspell.net/12dicts/
# Read the header-less word list and keep its first column (V1) as a plain
# character vector; it is used as an extra stop-word list when building dfms.
TOFCore <- as.character(
  read.delim(file = "2of5core.txt", header = FALSE)[["V1"]]
)
############


############
#create the scientific corpora and document-feature matrices (dfm)
#load the scientific corpus, available here:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5694964/
############

# Read each subject subset of the scientific corpus (files matched by the
# H*, L*, S* and P* globs) as UTF-8.
tempsci_H <- readtext("H*.txt", encoding = "UTF-8")
tempsci_L <- readtext("L*.txt", encoding = "UTF-8")
tempsci_S <- readtext("S*.txt", encoding = "UTF-8")
tempsci_P <- readtext("P*.txt", encoding = "UTF-8")

# NFKC-normalise the text column only. Passing the whole readtext data frame
# to stri_trans_nfkc() makes stringi coerce the data frame to a character
# vector (one element per column), which loses the one-row-per-document
# structure and the document ids.
tempsci_H$text <- stri_trans_nfkc(tempsci_H$text)
tempsci_L$text <- stri_trans_nfkc(tempsci_L$text)
tempsci_S$text <- stri_trans_nfkc(tempsci_S$text)
tempsci_P$text <- stri_trans_nfkc(tempsci_P$text)

############
# create the scientific corpus, note that it is created from subsets based on subject.
############

mycorpussci_H <- corpus(tempsci_H)
mycorpussci_L <- corpus(tempsci_L)
mycorpussci_P <- corpus(tempsci_P)
mycorpussci_S <- corpus(tempsci_S)
# Combine the four subject corpora into a single scientific corpus.
mycorpussci_E <- mycorpussci_H + mycorpussci_L + mycorpussci_P + mycorpussci_S

############
# create a dfm
#remove stopwords, punctuation, numbers, hyphens
#set required variables for sci_E data frame.
############
# Build the document-feature matrix for the combined scientific corpus.
# Features dropped: the TOFCore list, the SMART stop words and the token "cid".
# Punctuation and numbers are removed, symbols are kept, and no stemming is
# applied.
mydfmsci_E <- dfm(
  mycorpussci_E,
  remove = c(TOFCore, stopwords(source = "smart"), "cid"),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = FALSE,
  remove_hyphens = TRUE,
  stem = FALSE
)

# Per-feature frequency table, its frequency column, and the overall count.
sci_E <- textstat_frequency(mydfmsci_E)
f_sci_E <- sci_E$frequency
sci_total_E <- sum(f_sci_E)

############
#load the American English corpus
# http://www.anc.org/
############

# First pass: read every .txt file in the working directory only to detect
# encodings. NOTE(review): "*.txt" matches all .txt files present, so the
# working directory should contain only the American English corpus here.
tempTran_eng <- readtext("*.txt")
# presumably one detected encoding per file -- confirm against the
# documentation of encoding()
encodingType_eng <- encoding(tempTran_eng)$all

# Second pass: re-read using the detected encodings.
tempeng <- readtext("*.txt", encoding = encodingType_eng)
# NFKC-normalise the text column only. Passing the whole readtext data frame
# to stri_trans_nfkc() makes stringi coerce the data frame to a character
# vector (one element per column), which loses the one-row-per-document
# structure and the document ids.
tempeng$text <- stringi::stri_trans_nfkc(tempeng$text)
mycorpuseng <- corpus(tempeng)

############
# create a dfm
#remove stopwords, punctuation, numbers, hyphens
#set required variables for eng data frame.
############
# Build the document-feature matrix for the American English corpus.
# Features dropped: the TOFCore list and the SMART stop words. Punctuation and
# numbers are removed, symbols are kept, and no stemming is applied.
mydfmeng <- dfm(
  mycorpuseng,
  remove = c(TOFCore, stopwords(source = "smart")),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = FALSE,
  remove_hyphens = TRUE,
  stem = FALSE
)

# Per-feature frequency table, its frequency column, and the overall count.
eng <- textstat_frequency(mydfmeng)
f_eng <- eng$frequency
eng_total <- sum(f_eng)

########################
#END Option 1
########################


########################
#Option 2: read in pre-made corpora
########################

# American English reference: stored frequency table, its frequency column,
# and the sum of that column.
eng <- readRDS(file = "eng.rds")
f_eng <- eng$frequency
eng_total <- sum(f_eng)

# Scientific reference: stored frequency table, its frequency column, and the
# sum of that column.
sci_E <- readRDS(file = "sci_E.rds")
f_sci_E <- sci_E$frequency
sci_total_E <- sum(f_sci_E)

########################
#END Option 2
########################


############
#read in transcript(s), create a corpus, create a dfm
# the columns of the frequency table built from the dfm are: feature frequency rank docfreq group
############

############
#for use if the files consist of text documents
#encoding is guessed by the readtext command. It is recommended you include the encoding if it is known.
#temp <- readtext("*.txt", encoding = "UTF-8")
#sample file below from Project Gutenberg: The Adventures of Huckleberry Finn
############

# Fetch the sample transcript directly from its URL.
temp <- readtext("https://www.gutenberg.org/files/76/76-0.txt")
# NFKC-normalise the text column so the transcript is normalised the same way
# as the reference corpora and the pdf transcript path.
temp$text <- stri_trans_nfkc(temp$text)
# Name the transcript after the document(s) actually read. Looking the name up
# with dir() only works when a matching file happens to exist in the working
# directory, which is unrelated to the URL source used above.
filenames <- temp$doc_id

############
#for use if the files consist of pdfs
#sample pdf example.pdf
############

# First pass: read the pdf only to detect its encoding.
tempTran <- readtext("example.pdf")
# presumably the detected encoding for the file -- confirm against the
# documentation of encoding()
encodingType <- encoding(tempTran)$all

# File name(s) in the working directory matching the pattern; attached to the
# corpus as the "Name" document variable further down.
filenames <- dir(pattern=("example.pdf"))
# Second pass: re-read using the detected encoding.
temp <- readtext("example.pdf", encoding = encodingType)
# NFKC-normalise the text column only. Passing the whole readtext data frame
# to stri_trans_nfkc() makes stringi coerce it to a character vector, so the
# file name would become a document of its own next to the actual text.
temp$text <- stri_trans_nfkc(temp$text)

############
#Create the corpus from the transcript read above
############

# Build the transcript corpus and record the source file name(s) as a
# document variable.
mycorpus <- corpus(temp)
docvars(mycorpus, "Name") <- filenames

# Document-feature matrix of the transcript: punctuation and numbers removed,
# symbols kept, no stemming. No stop-word list is applied here.
mydfm <- dfm(
  mycorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  stem = FALSE,
  remove_symbols = FALSE,
  remove_hyphens = TRUE
)

# Document names via the public accessor rather than the internal S4 slot.
transcript <- docnames(mydfm)
# Feature frequencies grouped per document, and the total token count.
trans <- textstat_frequency(mydfm, groups = transcript)
trans_total <- sum(trans$frequency)

# Placeholder globals kept from the original script; the statements that
# follow in this section work on columns of `trans` instead.
a <- 0
b <- 0
j <- 0
j_norm <- 0
j_mean <- 0
result <- list()

############
#calculate j, add values to trans dataframe
############
# Relative frequency of every transcript feature in the scientific corpus (a)
# and in the general English corpus (b); match() yields NA for features that
# are missing from a reference corpus.
trans$a <- f_sci_E[match(trans$feature, sci_E$feature)] / sci_total_E
trans$b <- f_eng[match(trans$feature, eng$feature)] / eng_total

# Features missing from a reference corpus get relative frequency 0.
trans$a <- replace(trans$a, is.na(trans$a), 0)
trans$b <- replace(trans$b, is.na(trans$b), 0)

# Score j per feature:
#   3             when the feature occurs only in the scientific corpus
#   log10(a / b)  when it occurs in both and is relatively more frequent in
#                 the scientific corpus
#   0             otherwise
# The two non-zero cases are mutually exclusive (b == 0 versus b > 0).
trans$j <- with(
  trans,
  ifelse(
    a > 0 & b == 0,
    3,
    ifelse(a > b & b > 0, log10(a / b), 0)
  )
)

############
#output is below
############

# Frequency-weighted mean of j over all transcript tokens. The result is a
# single number that is recycled into every row of trans.
trans$j_norm <- sum((trans$j) * (trans$frequency)) / trans_total

# Explicit print() calls: bare top-level expressions are only auto-printed in
# an interactive session, so they would show nothing when the script is run
# through source().
print(min(trans$j))
print(mean(trans$j_norm))
print(max(trans$j))

# Features that occur in the scientific corpus only (j is exactly 3). The
# comparison is numeric; comparing against the string "3" would go through a
# number-to-text conversion of the whole j column.
jargon_words <- subset(trans, j == 3)

print(paste("Total number of words:", trans_total, sep = " "))
print(paste("The mean jargonness is ", mean(trans$j_norm), sep = " "))
print(paste("Words with j value of 3 include: "))
print(subset(jargon_words, select = c(group, feature)))