19 Text Analysis in R

https://gccollab.ca/discussion/view/7404441/text-analysis-in-r

19.1 Open source textbooks

https://smltar.com/
https://www.tidytextmining.com/
https://slcladal.github.io/topicmodels.html
https://slcladal.github.io/textanalysis.html

Also, a list of related resources and code for text mining (including web scraping) is collected here: https://github.com/gorodnichy/LA-R-text

19.2 Plagiarism detection

Q: Any ideas/packages/resources (in R) for plagiarism detection?

A: A good place to start is the “stylo” package (https://github.com/computationalstylistics/stylo - R package for stylometric analyses) which implements a wide variety of recent research in computational stylistics. Plagiarism detection is fraught (insert all of the usual ethical and computational caveats…), but stylo can help you identify passages that are stylistically unusual compared to the rest of the text. Unusualness definitely isn’t a proxy for plagiarism, but it’s a good place to start.
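A minimal sketch of running stylo non-interactively (it assumes your texts sit in a corpus/ subfolder, stylo's default layout; the parameter values are illustrative only):

library(stylo)

# Reads plain-text files from "corpus/" in the working directory and runs a
# cluster analysis on the 100 most frequent words; gui = FALSE skips the dialog
results <- stylo(gui = FALSE, corpus.dir = "corpus",
                 analysis.type = "CA",
                 mfw.min = 100, mfw.max = 100)
summary(results)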

Q: Is this focused on English language text? Are there lexicons or libraries for comparison within other languages (e.g., French)?

A: Stylo works well with quite a few non-English languages. French, for example, is supported, as are a number of languages with non-Latin alphabets like Arabic and Korean.

19.4 Useful code snippets

19.4.1 Basic cleaning: removing accents (benchmarking)


library(data.table)
library(magrittr)

# COVID-19 case data with accented French place names (e.g., Montréal)
dtCases <- fread("https://github.com/ishaberry/Covid19Canada/raw/master/cases.csv", stringsAsFactors = FALSE)
dtCases %>% dim

system.time(dtCases[, city0 := health_region]) # baseline: plain copy
system.time(dtCases[, city1 := base::iconv(health_region, from = "UTF-8", to = "ASCII//TRANSLIT")])
system.time(dtCases[, city2 := textclean::replace_non_ascii(health_region)])
system.time(dtCases[, city3 := stringi::stri_trans_general(health_region, id = "Latin-ASCII")])
dtCases[city0 != city1, city0:city3] %>% unique


microbenchmark::microbenchmark(
  dtCases[, city0 := iconv(health_region, to = "ASCII//TRANSLIT")], # iconv timed twice to gauge run-to-run variation
  dtCases[, city1 := iconv(health_region, to = "ASCII//TRANSLIT")],
  dtCases[, city2 := textclean::replace_non_ascii(health_region)],
  dtCases[, city3 := stringi::stri_trans_general(health_region, id = "Latin-ASCII")],
  times = 10)

# Unit: milliseconds
#                                                                 expr       min        lq      mean    median        uq       max neval
#  dtCases[, `:=`(city0, iconv(health_region, to = "ASCII//TRANSLIT"))]  166.8094  168.8169  172.8043  172.4714  174.7670  181.1422    10
#  dtCases[, `:=`(city1, iconv(health_region, to = "ASCII//TRANSLIT"))]  165.9741  168.5937  173.8419  171.0345  172.6113  204.4243    10
#   dtCases[, `:=`(city2, textclean::replace_non_ascii(health_region))] 8757.1358 8867.7838 9088.6954 9049.1962 9273.6495 9545.8301    10
#  dtCases[, `:=`(city3, stringi::stri_trans_general(health_region, id = "Latin-ASCII"))] 4204.2102 4230.9790 4301.7987 4293.4896 4339.4618 4430.2948    10
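Base iconv() is the clear winner here: on this dataset it is roughly 25 times faster than stringi::stri_trans_general() and over 50 times faster than textclean::replace_non_ascii().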

19.4.2 Text cleaning

library(textclean)
# library(textshape) #with(DATA, split_portion(state, n.words = 10))
# https://github.com/trinker/textclean

if(F) {
  mgsub(textclean::DATA$state, c("i", "it"), c("<<I>>", "[[IT]]"))
  mgsub(DATA$state, "[[:punct:]]", "<<PUNCT>>", fixed = FALSE)
  x <- c(
    "<bold>Random</bold> text with symbols: &nbsp; &lt; &gt; &amp; &quot; &apos;",
    "<p>More text</p> &cent; &pound; &yen; &euro; &copy; &reg;",
    "Welcome to A I: the best W O R L D!",
    "6 Ekstr\xf8m", "J\xf6reskog", "bi\xdfchen Z\xfcrcher",
    "I'm liiiike whyyyyy me?", "Wwwhhatttt!"
  )
  
  replace_html(x)
  replace_kern(x)
  x;  Encoding(x) <- "latin1"; x # mark the raw bytes as latin1 so the accents display correctly
  replace_non_ascii(x)
  
  # Tokens
  ## Set Up the Tokens to Replace
  
  lexicon::grady_augmented # 122,806 English words
  
  lexicon::common_names # 5,493 first names
  # Capitalize the first letter of each name
  nms <- gsub("(^.)(.*)", "\\U\\1\\L\\2", lexicon::common_names, perl = TRUE)
  head(nms)
  
  fuzzyjoin::misspellings %>% nrow() # 4,505 common misspellings
  
  replace_white(x)
  replace_word_elongation(x)
  
  
}

cleanWords <- function(phrase) {
  # cleanWords <- function(phrase, changecase = c("no", "upper", "lower", "title")) {
  phrase %>%
    # gsub("[^[:alnum:]]", "", .) %>%
    gsub("[^A-Za-z0-9 -]", "", .) %>% # keep letters, digits, spaces, hyphens
    gsub(" {2,}", " ", .)             # squeeze repeated spaces
  # Possible extensions:
  # %>%
  # iconv(to="ASCII//TRANSLIT")
  # replace_non_ascii
  # str_to_lower()
  # ifelse(changecase == "upper", str_to_upper(),
  #        ifelse(changecase == "lower", str_to_lower(),
  #               ifelse(changecase == "title", str_to_title(), .)))
}
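
# e.g. (made-up input):
cleanWords("a,,b   c!!") # "ab c"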

library(stringi)

# Transliterate a Cyrillic string to Latin using several BGN/PCGN schemes
arrayTranslitFromCyrilic <- function(slovo = "Дмитрий Городничий") {
  c(
    slovo %>% stri_trans_general("ukrainian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Russian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Bulgarian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Belarusian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Serbian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("cyrillic-latin") %>% cleanWords
  )
}

if (F) {
  # library(stringi)
  # List the transliteration schemes that stringi knows about
  dt <- stri_trans_list() %>% data.table() %>% setnames("coding")
  dt[coding %ilike% "Latin"][]
  dt[coding %ilike% "cyr"]
  dt[coding %ilike% "ascii"]
  dt1 <- iconvlist() %>% data.table() %>% setnames("coding")
  dt1[coding %ilike% "ascii"]
  dt1[coding %ilike% "ASCII"]
  dt1[coding %ilike% "TRANSLIT"]
  
  "Montréal" %>% iconv(to = "ASCII//TRANSLIT")
  
  c("шерсть", "женщина", "челюсть", "Володимир Зеленський", "Дмитрий Городничий") %>%
    sapply(arrayTranslitFromCyrilic)
  
}

19.4.3 Extracting, re-ordering words in a string

https://stackoverflow.com/questions/55244680/extract-words-in-between-two-commas-in-r


word <- 'Fu Tien Mansion, Taikoo Shing, Hong Kong'
# Goal: extract the part between the two commas and concatenate it with the
# first part, to get: 'Taikoo Shing Fu Tien Mansion'. Which regex to use?

# 1: sub  ----
sub("^([^,]+),\\s*([^,]+),.*", "\\2 \\1", word)
#[1] "Taikoo Shing Fu Tien Mansion"

# 2: strsplit ----

x <- base::strsplit(word, ",")[[1]]
paste(x[2], x[1])

# 3: vectorized strsplit + trimws ----
parts <- strsplit(word, ",") # works for a whole character vector too
paste(
    trimws(sapply(parts, `[`, 2)), # trimws() removes leading/trailing whitespace
    trimws(sapply(parts, `[`, 1))
    )
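
# 4: stringr::str_match (a sketch; assumes library(stringr)) ----
m <- str_match(word, "^([^,]+),\\s*([^,]+)") # m[, 2] = first part, m[, 3] = middle part
paste(m[, 3], m[, 2])
#[1] "Taikoo Shing Fu Tien Mansion"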

19.4.4 Automatically finding / removing common parts in strings

https://stackoverflow.com/questions/48701107/find-length-of-overlap-in-strings

# Return the longest substring of the shorter string that also occurs in the longer one
findStrOverlap <- function(str1, str2, ignore.case = FALSE) { # , verbose = FALSE

  if(ignore.case) {
    str1 <- tolower(str1);    str2 <- tolower(str2)
  }
  # Make str1 the longer string, so we enumerate substrings of the shorter one
  if(nchar(str1) < nchar(str2)) {
    x <- str2;    str2 <- str1;    str1 <- x
  }

  # Build the index sets of all substrings of str2 ...
  x <- strsplit(str2, "")[[1L]]
  n <- length(x)
  s <- sequence(seq_len(n))
  s <- split(s, cumsum(s == 1L))
  s <- rep(list(s), n)

  for(i in seq_along(s)) {
    s[[i]] <- lapply(s[[i]], function(x) {
      x <- x + (i-1L)
      x[x <= n]
    })
    s[[i]] <- unique(s[[i]])
  }

  # ... and test them longest-first until one is found inside str1
  s <- unlist(s, recursive = FALSE)
  s <- unique(s[order(-lengths(s))])

  i <- 1L
  len_s <- length(s)
  while(i <= len_s) { # "<=" so the final (single-character) candidates are checked too
    lcs <- paste(x[s[[i]]], collapse = "")
    # if(verbose) cat("now checking:", lcs, "\n")
    check <- grepl(lcs, str1, fixed = TRUE)
    if(check) {
        # if(verbose) cat(paste0("Found: '",lcs,"' (length =", nchar(lcs), ") \n")) 
        break
    } else {
      i <- i + 1L 
    }
  }
  return (lcs)
}

library(data.table)
library(stringr)

# Full table from Statistics Canada (optional):
# dt <- cansim::get_cansim("13-10-0810-01") %>% setDT()
# A small reproducible extract:
dt <- data.table::data.table(
    GEO=c( # From CANSIM Table
        "Newfoundland and Labrador, place of occurrence",
        "Prince Edward Island, place of occurrence",     
        "Nova Scotia, place of occurrence"
))

# Strip the part shared by all strings (found from the first two entries)
removeCommonStrPart <- function(aStr) {
    str0 <- findStrOverlap( aStr[1],  aStr[2]); str0
    str_replace(aStr, str0, "")
}
aStr <- dt$GEO # keep the originals for the stringdist examples below

dt[, GEO:=removeCommonStrPart(GEO)][]
#                         GEO
#                      <char>
#1: Newfoundland and Labrador
#2:      Prince Edward Island
#3:               Nova Scotia

19.4.5 Useful packages

stringdist

# Longest-common-subsequence distance between the first two (original) strings
stringdist::stringdist(aStr[1], aStr[2], method = "lcs") # 29
# Pairwise distance matrix for the whole column
stringdist::stringdistmatrix(dt$GEO, dt$GEO, method = "lcs") %>% as.data.table
# q-gram (bigram) distance
stringdist::stringdist(aStr[1], aStr[2], method = "qgram", q = 2) # 37
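
# The same package also does approximate matching: index of the closest table
# entry within maxDist under Jaro-Winkler distance (a sketch; strings are made up)
stringdist::amatch("Monreal", c("Montreal", "Toronto", "Vancouver"),
                   method = "jw", maxDist = 0.2) # 1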

19.4.5.1 Not as useful…

tidystringdist converts stringdist results into a tibble. It is not optimized and is very slow; better to write that code yourself using data.table!

19.4.5.2 Language Detection and converting

https://stackoverflow.com/questions/8078604/detect-text-language-in-r

# 2.1 ----
library(textcat)
# The textcat Package for n-Gram Based Text Categorization in R. Journal of Statistical Software, http://www.jstatsoft.org/v52/i06/

# 2.2 cld2 - best (see also the archived cldr below) ----

library(cld2) # the fastest of all
# library(cld3)

if (F) {
  # Install the archived cldr package from the CRAN archive
  url <- "http://cran.us.r-project.org/src/contrib/Archive/cldr/cldr_1.1.0.tar.gz"
  pkgFile <- "cldr_1.1.0.tar.gz"
  download.file(url = url, destfile = pkgFile)
  install.packages(pkgs = pkgFile, type = "source", repos = NULL)
  unlink(pkgFile)
  # or: devtools::install_version("cldr", version = "1.1.0")
  
  # usage
  library(cldr)
  demo(cldr)
}

# The cldr package used above is no longer available on CRAN and may be difficult
# to install. However, Google's (Chromium's) cld libraries are now available in R
# through other dedicated packages, cld2 and cld3.

# 2.3 ----
# An approach in R would be to keep a text file of English words. I have several of these including one from http://www.sil.org/linguistics/wordlists/english/. After sourcing the .txt file you can use this file to match against each tweet. Something like:

lapply(tweets, function(x) EnglishWordComparisonList %in% x)


# 2.9 benchmarking - data! ----

data(reuters, package = "kernlab") # a corpus of 40 news articles in English
reuters
length(reuters)
# [1] 40
sapply(reuters ,nchar)
# [1] 1311  800  511 2350  343  388 3705  604  254  239  632  607  867  240
# [15]  234  172  538  887 2500 1030  538 2681  338  402  563 2825 2800  947
# [29] 2156 2103 2283  604  632  602  642  892 1187  472 1829  367
text <- unlist(reuters)
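
# detect_from_sw() below is not from a package; here is a minimal sketch of a
# stopword-based detector (assumes the tm package for its stopword lists):
detect_from_sw <- function(text, languages) {
  sapply(text, function(txt) {
    words <- tolower(strsplit(txt, "[^[:alpha:]]+")[[1]])
    hits <- sapply(languages, function(lang) sum(words %in% tm::stopwords(lang)))
    names(which.max(hits)) # language with the most stopword hits
  }, USE.NAMES = FALSE)
}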

microbenchmark::microbenchmark(
  textcat = textcat::textcat(text),
  cld2 = cld2::detect_language(text),
  cld3 = cld3::detect_language(text),
  detect_from_sw = detect_from_sw(text,c("english","french","german")),
  times=10)


# Text analysis ----

# . reading / shaping text ----

library(textshape)
# https://github.com/trinker/textshape
# textshape is a small suite of text reshaping and restructuring functions, many
# of them descended from tools in the qdapTools package. It brings reshaping tools
# under one roof, with the package's functionality limited to text reshaping.
# It is meant to be used jointly with the textclean package, which provides cleaning
# and text normalization, while the textreadr package imports various common text
# data sources into R for reshaping and cleaning.

library(textreadr)
# pdf_doc <- system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr")
# html_doc <- system.file('docs/textreadr_creed.html', package = "textreadr")

# Some other implementations of text readers in R:
# tm
# readtext

'https://github.com/trinker/textreadr/raw/master/inst/docs/pres.deb1.docx' %>%
  download() %>%
  read_docx() %>%
  head(3)
# > Plagiarism detection ----  



# https://cran.r-project.org/web/packages/RNewsflow/vignettes/RNewsflow.html

library(RNewsflow)
# RNewsflow also installs:
library(fastmatch)
library(quanteda)
rnewsflow_dfm # demo document-feature matrix that ships with RNewsflow

# https://cran.r-project.org/web/packages/corpustools/vignettes/corpustools.html

library(corpustools)
# corpustools: Managing, Querying and Analyzing Tokenized Text
# Provides text analysis in R, focusing on the use of a tokenized text format.

# syuzhet: Extracts Sentiment and Sentiment-Derived Plot Arcs from Text
library(syuzhet)
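# e.g., sentence-level sentiment scores with the default "syuzhet" lexicon
# (made-up sentences):
syuzhet::get_sentiment(c("I love this.", "I hate that.")) # positive, then negative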


# 5 > Web crawling ----


# fuzzyjoin::misspellings ------
## Not run: 
library(rvest)
library(readr)
library(dplyr)
library(stringr)
library(tidyr)

u <- "https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines"
h <- read_html(u)

misspellings <- h %>%
  html_nodes("pre") %>%
  html_text() %>%
  readr::read_delim(col_names = c("misspelling", "correct"), delim = ">",
                    skip = 1) %>%
  mutate(misspelling = str_sub(misspelling, 1, -2)) %>% # drop the trailing "-"
  mutate(correct = str_split(correct, ", ")) %>%        # one row per correction
  unnest(correct) %>%
  filter(Encoding(correct) != "UTF-8")

## End(Not run)


## selectr

# https://cran.r-project.org/web/packages/selectr/
#   https://sjp.co.nz/projects/selectr
# 
#   selectr: Translate CSS Selectors to XPath Expressions
# Translates a CSS3 selector into an equivalent XPath expression. This allows us to use CSS selectors when working with the XML package as it can only evaluate XPath expressions. Also provided are convenience functions useful for using CSS selectors on XML nodes. This package is a port of the Python package 'cssselect' (<https://cssselect.readthedocs.io/>).
# 
# Suggests: testthat, XML, xml2
# Reverse imports:  cliapp, ganalytics, Rcrawler, rvest, wikilake


##  Rcrawler

# Rcrawler: Web Crawler and Scraper
# Performs parallel web crawling and web scraping. It is designed to crawl, parse and store web pages to produce data that can be directly used for analysis application.
# https://cran.r-project.org/web/packages/Rcrawler
# https://github.com/salimk/Rcrawler/
#   RCrawler: An R package for parallel web crawling and scraping,  https://www.sciencedirect.com/science/article/pii/S2352711017300110?via%3Dihub
# 
# Version:  0.1.9-1
# Imports:  httr, xml2, data.table, foreach, doParallel, parallel, selectr, webdriver, callr, jsonlite
# Published:    2018-11-11
# 
# https://cran.r-project.org/web/packages/xml2/index.html

19.4.6 cleanText(text): clean text

19.4.6.1 Data cleaning tasks:

  • Stripping non-relevant symbols, including extra spaces
  • Contractions and incomplete words
  • Abbreviations and slang
  • Kerning (“B O M B!”)
  • Elongations (“looong”)
  • Converting from non-ASCII; removing accents
  • Cleaning numbers: 99,999; 99.999; 99.99%; 99,99%; 2nd
  • Normalizing words to a common form (e.g., Title Case)
  • Fixing common typos (see the textclean sketch after this list)
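
Most of these tasks map onto one-line textclean calls. A minimal sketch (the input strings are made up; the functions are textclean's):

library(textclean)

x <- c("B O M B!", "whyyyy me", "I'm gonna", "bi\xdfchen Z\xfcrcher", "99,999 cases")
Encoding(x) <- "latin1"

replace_kern(x)            # kerning: "B O M B!" -> "BOMB!"
replace_word_elongation(x) # elongations: "whyyyy" -> "why"
replace_contraction(x)     # contractions: "I'm" -> "I am"
replace_non_ascii(x)       # accents/non-ASCII: "Zürcher" -> "Zurcher"
replace_number(x)          # digits -> number words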

19.4.6.2 Example 1:


x <- c("i like", "<p>i want. </p>. thet them ther .", "I am ! that|", "", NA, 
       "&quot;they&quot; they,were there", ".", "   ", "?", "3;", "I like goud eggs!", 
       "bi\xdfchen Z\xfcrcher", "i 4like...", "\\tgreat",  "She said \"yes\"")
Encoding(x) <- "latin1"
x <- as.factor(x)


print(x)

Problem diagnosis:

check_text(x)

19.4.7 Convert Text to Date or Timestamp

  
library(lubridate)
library(stringr)

# Convert Roman-numeral months (e.g., "12 III 2021") to digits, then try common
# date orderings until one parses. The replacements must run longer numerals
# first, with the subtractive forms IX/IV before X/V. Each failed parsing
# attempt emits a lubridate warning before a later format succeeds.
text2date <- function(a) {
  
  a <- a %>% str_replace("XII", "12") %>%
    str_replace("XI", "11") %>%
    str_replace("IX", "9") %>%
    str_replace("X", "10") %>%
    str_replace("VIII", "8") %>%
    str_replace("VII", "7") %>%
    str_replace("VI", "6") %>%
    str_replace("IV", "4") %>%
    str_replace("V", "5") %>%
    str_replace("III", "3") %>%
    str_replace("II", "2") %>%
    str_replace("I", "1")
  
  x <- dmy(a)
  if (is.na(x)) x <- dym(a)
  if (is.na(x)) x <- ymd(a)
  if (is.na(x)) x <- ydm(a)
  if (is.na(x)) x <- mdy(a)
  if (is.na(x)) x <- myd(a)
  if (is.na(x)) x <- ymd_hms(a)
  
  if (year(x) > 3000) x <- x - years(100) # crude sanity fix for misparsed years
  return (x)
}

# Split a text date into integer YY / MM / DD columns (a.k.a. text2dtDate)
text2dtYYMMDD <- function(a, text = F, date = F, timestamp = F) {
  x <- text2date(a)
  dt <- data.table(YY = year(x) %>% as.integer(),
                   MM = month(x) %>% as.integer(),
                   DD = day(x) %>% as.integer())
  if (text)      dt %<>% cbind(data.table(text = a))      # keep the raw text
  if (date)      dt %<>% cbind(data.table(date = x))      # keep the parsed Date
  if (timestamp) dt %<>% cbind(data.table(timestamp = now("EST")))
  return(dt)
}


# Try timestamp formats from most to least specific, falling back to date-only
text2timestamp <- function(a) {
  x <- ymd_hms(a)
  if (is.na(x)) x <- ymd_hm(a)
  if (is.na(x)) x <- ymd_h(a)
  if (is.na(x)) x <- dmy_hms(a)
  if (is.na(x)) x <- dmy_hm(a)
  if (is.na(x)) x <- dmy_h(a)
  if (is.na(x)) x <- mdy_hms(a)
  if (is.na(x)) x <- mdy_hm(a)
  if (is.na(x)) x <- mdy_h(a)
  if (is.na(x)) x <- ydm_hms(a)
  if (is.na(x)) x <- ydm_hm(a)
  if (is.na(x)) x <- ydm_h(a)
  if (is.na(x)) x <- text2date(a)
  return(x)
}
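
A quick check (made-up inputs; for inputs that only match a later format, lubridate warns on each earlier failed attempt):

text2date("12 III 2021")           # "2021-03-12"
text2timestamp("2021-03-12 14:30") # "2021-03-12 14:30:00 UTC"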

19.4.8 Transliteration & cleaning

cleanPhrase <- function(phrase) {
  
  phrase %>%
    iconv(to = "ASCII//TRANSLIT") %>%   # transliterate accents first (é -> e)
    textclean::replace_non_ascii %>%    # catch anything iconv left behind
    gsub("[^A-Za-z0-9 -]", "", .) %>%   # keep only letters, digits, spaces, hyphens
    gsub(" {2,}", " ", .) %>%           # squeeze repeated spaces
    stringr::str_to_lower()
}
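
For example (made-up input):

cleanPhrase("Montréal!!") # "montreal"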

cleanWords() and arrayTranslitFromCyrilic() from Section 19.4.2 above are reused here unchanged.