19 Text Analysis in R
https://gccollab.ca/discussion/view/7404441/text-analysis-in-r
19.1 Open source textbooks
https://smltar.com/, https://www.tidytextmining.com/, https://slcladal.github.io/topicmodels.html, https://slcladal.github.io/textanalysis.html
Also, a list of related resources and code for text mining (including web scraping) is collected here: https://github.com/gorodnichy/LA-R-text
19.2 Plagiarism detection
Q: Any ideas/packages/resources (in R) for plagiarism detection?
A: A good place to start is the “stylo” package (https://github.com/computationalstylistics/stylo - R package for stylometric analyses), which implements a wide range of recent research in computational stylistics. Plagiarism detection is fraught (insert all of the usual ethical and computational caveats…), but stylo can help you identify passages that are stylistically unusual compared to the rest of the text. Unusualness is by no means proof of plagiarism, but it is a reasonable starting point.
Q: Is this focused on English language text? Are there lexicons or libraries for comparison within other languages (e.g., French)?
A: Stylo works well with quite a few non-English languages. French, for example, is supported, as are a number of languages with non-Latin alphabets like Arabic and Korean.
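For a first pass, a minimal sketch (assuming a corpus/ folder of plain-text files, the layout stylo expects; all parameter values here are illustrative):
library(stylo)
# Cluster the texts by their most-frequent-word profiles; outliers in the
# resulting plot are candidates for a closer look. corpus.lang switches on
# language-specific tokenization (e.g. "French").
stylo(gui = FALSE, corpus.dir = "corpus", corpus.lang = "French",
      analysis.type = "CA", mfw.min = 100, mfw.max = 100)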
19.4 Useful code snippets
19.4.1 Basic cleaning: removing accents (benchmarking)
library(data.table)
library(magrittr)
# A real dataset with accented place names in the health_region column
dtCases <- fread("https://github.com/ishaberry/Covid19Canada/raw/master/cases.csv", stringsAsFactors = FALSE)
dtCases %>% dim
# Three ways to strip accents, timed individually (city0 is a plain copy, as a baseline)
system.time(dtCases[, city0 := health_region])
system.time(dtCases[, city1 := base::iconv(health_region, from = "UTF-8", to = "ASCII//TRANSLIT")])
system.time(dtCases[, city2 := textclean::replace_non_ascii(health_region)])
system.time(dtCases[, city3 := stringi::stri_trans_general(health_region, id = "Latin-ASCII")])
dtCases[city0 != city1, city0:city3] %>% unique
microbenchmark::microbenchmark(
  dtCases[, city0 := iconv(health_region, to = "ASCII//TRANSLIT")],
  dtCases[, city1 := iconv(health_region, to = "ASCII//TRANSLIT")],
  dtCases[, city2 := textclean::replace_non_ascii(health_region)],
  dtCases[, city3 := stringi::stri_trans_general(health_region, id = "Latin-ASCII")],
  times = 10)
# Unit: milliseconds
#                                                                                    expr       min        lq      mean    median        uq       max neval
#                    dtCases[, `:=`(city0, iconv(health_region, to = "ASCII//TRANSLIT"))]  166.8094  168.8169  172.8043  172.4714  174.7670  181.1422    10
#                    dtCases[, `:=`(city1, iconv(health_region, to = "ASCII//TRANSLIT"))]  165.9741  168.5937  173.8419  171.0345  172.6113  204.4243    10
#                     dtCases[, `:=`(city2, textclean::replace_non_ascii(health_region))] 8757.1358 8867.7838 9088.6954 9049.1962 9273.6495 9545.8301    10
#  dtCases[, `:=`(city3, stringi::stri_trans_general(health_region, id = "Latin-ASCII"))] 4204.2102 4230.9790 4301.7987 4293.4896 4339.4618 4430.2948    10
# Takeaway: plain iconv() is ~25x faster than stringi::stri_trans_general() and
# ~50x faster than textclean::replace_non_ascii() on this dataset.
19.4.2 Text cleaning
library(textclean)
# library(textshape) #with(DATA, split_portion(state, n.words = 10))
# https://github.com/trinker/textclean
if (F) {
mgsub(textclean::DATA$state, c("i", "it"), c("<<I>>", "[[IT]]"))
mgsub(DATA$state, "[[:punct:]]", "<<PUNCT>>", fixed = FALSE)
x <- c(
"<bold>Random</bold> text with symbols: &lt; &gt; &amp; &quot; &apos;",
"<p>More text</p> &cent; &pound; &yen; &euro; &copy; &reg;",
"Welcome to A I: the best W O R L D!",
"6 Ekstr\xf8m", "J\xf6reskog", "bi\xdfchen Z\xfcrcher",
"I'm liiiike whyyyyy me?", "Wwwhhatttt!"
)
replace_html(x)
replace_kern(x)
x; Encoding(x) <- "latin1"; x
replace_non_ascii(x)
# Tokens
## Set up the tokens to replace
lexicon::grady_augmented # 122,806 English words
lexicon::common_names # 5,493 first names
nms <- gsub("(^.)(.*)", "\\U\\1\\L\\2", lexicon::common_names, perl = TRUE) # title-case the names
head(nms)
fuzzyjoin::misspellings %>% nrow() # 4,505 misspelling/correction pairs
replace_white(x)
replace_word_elongation(x)
}
# Keep only letters, digits, spaces and hyphens, then collapse repeated spaces.
# Possible extensions: iconv(to = "ASCII//TRANSLIT"), textclean::replace_non_ascii(),
# stringr::str_to_upper() / str_to_lower() / str_to_title() for case normalization.
cleanWords <- function(phrase) {
  phrase %>%
    gsub("[^A-Za-z0-9 -]", "", .) %>%
    gsub(" {2,}", " ", .)
}
library(stringi)
# Transliterate a Cyrillic name with several BGN/PCGN schemes: the same
# string romanizes differently depending on the assumed source language
arrayTranslitFromCyrilic <- function(slovo = "Дмитрий Городничий") {
  c(
    slovo %>% stri_trans_general("ukrainian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Russian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Bulgarian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Belarusian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("Serbian-Latin/bgn") %>% cleanWords,
    slovo %>% stri_trans_general("cyrillic-latin") %>% cleanWords
  )
}
if (F) {
  # Browse the available transliteration schemes and iconv encodings
  dt <- stri_trans_list() %>% data.table() %>% setnames("coding")
  dt[coding %ilike% "Latin"][]
  dt[coding %ilike% "cyr"]
  dt[coding %ilike% "ascii"]
  dt1 <- iconvlist() %>% data.table() %>% setnames("coding")
  dt1[coding %ilike% "ascii"]
  dt1[coding %ilike% "ASCII"]
  dt1[coding %ilike% "TRANSLIT"]
  "Montréal" %>% iconv(to = "ASCII//TRANSLIT")
  c("шерсть", "женщина", "челюсть", "Володимир Зеленський", "Дмитрий Городничий") %>%
    sapply(arrayTranslitFromCyrilic)
}
19.4.3 Extracting, re-ordering words in a string
https://stackoverflow.com/questions/55244680/extract-words-in-between-two-commas-in-r
word <- 'Fu Tien Mansion, Taikoo Shing, Hong Kong'
# Goal: extract the part between the two commas and put it before the first
# part, to get 'Taikoo Shing Fu Tien Mansion'. What regex to use?
# 1: sub ----
sub("^([^,]+),\\s*([^,]+),.*", "\\2 \\1", word)
#[1] "Taikoo Shing Fu Tien Mansion"
# 2: -----
x <- base::strsplit(word, ",")[[1]]
paste(x[2], x[1])
# 3 -----
paste(
trimws( # Remove Leading/Trailing Whitespace
sapply(strsplit(word,","), `[`, 2)
),
trimws(sapply(strsplit(word,","), `[`, 1))
)
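A stringr variant of the same idea (a sketch added here; it assumes the same three-part 'a, b, c' input shape):
# 4: stringr ----
library(stringr)
parts <- str_trim(str_split(word, ",")[[1]])
str_c(parts[2], parts[1], sep = " ")
#[1] "Taikoo Shing Fu Tien Mansion"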
19.4.4 Automatically finding / removing common parts in strings
https://stackoverflow.com/questions/48701107/find-length-of-overlap-in-strings
# Longest common substring of two strings (adapted from the SO answer above).
# Assumes some overlap exists; if there is none, the last (shortest) candidate
# is returned even though it does not occur in both strings.
findStrOverlap <- function(str1, str2, ignore.case = FALSE) {
  if (ignore.case) {
    str1 <- tolower(str1); str2 <- tolower(str2)
  }
  # Make str1 the longer of the two strings
  if (nchar(str1) < nchar(str2)) {
    x <- str2; str2 <- str1; str1 <- x
  }
  x <- strsplit(str2, "")[[1L]]
  n <- length(x)
  # Build every contiguous run of character indices of str2, longest first
  s <- sequence(seq_len(n))
  s <- split(s, cumsum(s == 1L))
  s <- rep(list(s), n)
  for (i in seq_along(s)) {
    s[[i]] <- lapply(s[[i]], function(x) {
      x <- x + (i - 1L)
      x[x <= n]
    })
    s[[i]] <- unique(s[[i]])
  }
  s <- unlist(s, recursive = FALSE)
  s <- unique(s[order(-lengths(s))])
  # Test candidates from longest to shortest until one occurs in str1
  i <- 1L
  len_s <- length(s)
  while (i <= len_s) {
    lcs <- paste(x[s[[i]]], collapse = "")
    if (grepl(lcs, str1, fixed = TRUE)) break
    i <- i + 1L
  }
  return(lcs)
}
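Quick check (the expected longest shared substring here is the common suffix):
findStrOverlap("Nova Scotia, place of occurrence",
               "Prince Edward Island, place of occurrence")
#[1] ", place of occurrence"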
library(data.table)
library(stringr)
# dt <- cansim::get_cansim("13-10-0810-01") %>% setDT() # the real CANSIM source
dt <- data.table::data.table(
  GEO = c( # sample rows from the CANSIM table
    "Newfoundland and Labrador, place of occurrence",
    "Prince Edward Island, place of occurrence",
    "Nova Scotia, place of occurrence"
  ))
aStr <- dt$GEO
# Remove the part shared by all strings (estimated from the first two)
removeCommonStrPart <- function(aStr) {
  str0 <- findStrOverlap(aStr[1], aStr[2])
  str_replace(aStr, str0, "")
}
dt[, GEO := removeCommonStrPart(GEO)][]
# GEO
# <char>
#1: Newfoundland and Labrador
#2: Prince Edward Island
#3: Nova Scotia
19.4.5 Useful packages
stringdist
stringdist::stringdist(aStr[1], aStr[2], method = "lcs") # 29
stringdist::stringdistmatrix(dt$GEO, dt$GEO, method = "lcs") %>% as.data.table
stringdist::stringdist(aStr[1], aStr[2], method = "qgram", q = 2) # 37
19.4.5.1 Not as useful…
tidystringdist
- Converts stringdist results into a tibble; not optimized and very slow.
Better to write that yourself using data.table (see the sketch below)!
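A minimal sketch of that data.table approach (the words vector is illustrative): CJ() builds the pair table and the distance column is added by reference.
library(data.table)
library(stringdist)
words <- c("Montreal", "Montréal", "Monterrey", "Toronto")
dtPairs <- CJ(a = words, b = words)[a < b] # unique unordered pairs
dtPairs[, dist := stringdist(a, b, method = "lcs")][]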
19.4.5.2 Language detection and conversion
https://stackoverflow.com/questions/8078604/detect-text-language-in-r
# 2.1 textcat ----
library(textcat)
# "The textcat Package for n-Gram Based Text Categorization in R",
# Journal of Statistical Software, http://www.jstatsoft.org/v52/i06/
# 2.2 cld2: the best option (the older cldr is archived on CRAN) ----
library(cld2) # the fastest of the options here
# library(cld3)
if (F) {
  # Install the archived cldr package from the CRAN archive
  url <- "http://cran.us.r-project.org/src/contrib/Archive/cldr/cldr_1.1.0.tar.gz"
  pkgFile <- "cldr_1.1.0.tar.gz"
  download.file(url = url, destfile = pkgFile)
  install.packages(pkgs = pkgFile, type = "source", repos = NULL)
  unlink(pkgFile)
  # or: devtools::install_version("cldr", version = "1.1.0")
  # usage:
  library(cldr)
  demo(cldr)
}
# The cldr package used in an earlier answer is no longer on CRAN and may be
# difficult to install. However, Google's (Chromium's) CLD libraries are now
# available in R through the dedicated packages cld2 and cld3.
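Basic usage of cld2 (it returns ISO language codes, or NA when unsure):
cld2::detect_language(c("To be or not to be", "Être ou ne pas être"))
# [1] "en" "fr"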
# 2.3 word-list matching ----
# Another approach in R: keep a text file of English words (for example from
# http://www.sil.org/linguistics/wordlists/english/) and match each tweet
# against it. Something like (tweets and EnglishWordComparisonList are placeholders):
lapply(tweets, function(x) EnglishWordComparisonList %in% x)
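A slightly fuller sketch of the same idea, scoring each text by the share of its tokens found in an English word list (lexicon::grady_augmented, used above; english_share is a made-up helper name):
english_share <- function(txt, wordlist = lexicon::grady_augmented) {
  tokens <- tolower(unlist(strsplit(txt, "[^[:alpha:]]+")))
  mean(tokens %in% wordlist) # share of recognized English tokens
}
sapply(c("the cat sat on the mat", "le chat est assis"), english_share)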
# 2.9 benchmarking - data! ----
data(reuters, package = "kernlab") # a corpus of articles in english
reuters
length(reuters)
# [1] 40
sapply(reuters, nchar)
# [1] 1311 800 511 2350 343 388 3705 604 254 239 632 607 867 240
# [15] 234 172 538 887 2500 1030 538 2681 338 402 563 2825 2800 947
# [29] 2156 2103 2283 604 632 602 642 892 1187 472 1829 367
text <- unlist(reuters)
microbenchmark::microbenchmark(
  textcat = textcat::textcat(text),
  cld2 = cld2::detect_language(text),
  cld3 = cld3::detect_language(text),
  # detect_from_sw is a custom stopword-based detector from the SO answer above
  detect_from_sw = detect_from_sw(text, c("english", "french", "german")),
  times = 10)
# Text analysis ----
# . reading / shaping text ----
library(textshape)
# https://github.com/trinker/textshape
# textshape is a small suite of text reshaping and restructuring functions, many of them descended from tools in the qdapTools package. It brings reshaping tools under one roof, with the package's functionality limited specifically to text reshaping.
# This package is meant to be used jointly with the textclean package, which provides cleaning and text normalization functionality. Additionally, the textreadr package is designed to import various common text data sources into R for reshaping and cleaning.
library(textreadr)
# pdf_doc <- system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr")
# html_doc <- system.file('docs/textreadr_creed.html', package = "textreadr")
#
#Some other implementations of text readers in R:
# tm
# readtext
'https://github.com/trinker/textreadr/raw/master/inst/docs/pres.deb1.docx' %>%
download() %>%
read_docx() %>%
head(3)
# > Plagiarism detection ----
# https://cran.r-project.org/web/packages/RNewsflow/vignettes/RNewsflow.html
library(RNewsflow)
# installing it also pulls in:
library(fastmatch)
library(quanteda)
rnewsflow_dfm # example document-feature matrix shipped with the package
# https://cran.r-project.org/web/packages/corpustools/vignettes/corpustools.html
library(corpustools)
# corpustools: Managing, Querying and Analyzing Tokenized Text
# Provides text analysis in R, focusing on the use of a tokenized text format.
# syuzhet: Extracts Sentiment and Sentiment-Derived Plot Arcs from Text
library(syuzhet)
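A quick taste of syuzhet (a hedged sketch; the default "syuzhet" lexicon returns signed scores, positive for the first sentence and negative for the second):
syuzhet::get_sentiment(c("I love this.", "I hate this."), method = "syuzhet")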
# 5 > Web crawling ----
# fuzzyjoin::misspellings ------
## Not run:
library(rvest)
library(readr)
library(dplyr)
library(stringr)
library(tidyr)
u <- "https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines"
h <- read_html(u)
misspellings <- h %>%
  html_nodes("pre") %>%
  html_text() %>%
  readr::read_delim(col_names = c("misspelling", "correct"), delim = ">",
                    skip = 1) %>%
  mutate(misspelling = str_sub(misspelling, 1, -2)) %>%
  mutate(correct = str_split(correct, ", ")) %>% # modern tidyr: mutate, then unnest
  unnest(correct) %>%
  filter(Encoding(correct) != "UTF-8")
## End(Not run)
## selectr
# https://cran.r-project.org/web/packages/selectr/
# https://sjp.co.nz/projects/selectr
#
# selectr: Translate CSS Selectors to XPath Expressions
# Translates a CSS3 selector into an equivalent XPath expression. This allows us to use CSS selectors when working with the XML package as it can only evaluate XPath expressions. Also provided are convenience functions useful for using CSS selectors on XML nodes. This package is a port of the Python package 'cssselect' (<https://cssselect.readthedocs.io/>).
#
# Suggests: testthat, XML, xml2
# Reverse imports: cliapp, ganalytics, Rcrawler, rvest, wikilake
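For example, a one-line sketch of the translation:
selectr::css_to_xpath("div.note > p", prefix = "//")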
## Rcrawler
# Rcrawler: Web Crawler and Scraper
# Performs parallel web crawling and web scraping. It is designed to crawl, parse and store web pages to produce data that can be used directly in analysis applications.
# https://cran.r-project.org/web/packages/Rcrawler
# https://github.com/salimk/Rcrawler/
# RCrawler: An R package for parallel web crawling and scraping, https://www.sciencedirect.com/science/article/pii/S2352711017300110?via%3Dihub
#
# Version: 0.1.9-1
# Imports: httr, xml2, data.table, foreach, doParallel, parallel, selectr, webdriver, callr, jsonlite
# Published: 2018-11-11
#
# https://cran.r-project.org/web/packages/xml2/index.html
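A minimal hedged sketch of a shallow crawl with Rcrawler (the URL is a placeholder; see the paper above for the full API):
library(Rcrawler)
Rcrawler(Website = "https://example.com", no_cores = 2, no_conn = 2, MaxDepth = 1)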
19.4.6 cleanText(text): clean text
19.4.6.1 Data cleaning tasks (several are illustrated in the sketch after this list):
- Stripping non-relevant symbols, including extra spaces
- Contractions and incomplete words
- Abbreviations and slang
- Kerning ("B O M B!")
- Elongations ("looong")
- Converting from non-ASCII; removing accents
- Cleaning numbers: 99,999; 99.999; 99.99%; 99,99%; 2nd
- Normalizing words to a common form (e.g. Title Case)
- Fixing common typos
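Several of these tasks map directly onto textclean helpers; a minimal sketch with a made-up input string:
library(textclean)
library(magrittr)
txt <- "B O M B! I'm liiiike whyyyyy, 99.99% shure thet is right"
txt %>%
  replace_kern() %>% # kerning: "B O M B!" -> "BOMB!"
  replace_word_elongation() %>% # "liiiike" -> "like", "whyyyyy" -> "why"
  replace_contraction() %>% # "I'm" -> "I am"
  replace_number(remove = TRUE) # strip the numbers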
19.4.6.2 Example 1:
x <- c("i like", "<p>i want. </p>. thet them ther .", "I am ! that|", "", NA,
""they" they,were there", ".", " ", "?", "3;", "I like goud eggs!",
"bi\xdfchen Z\xfcrcher", "i 4like...", "\\tgreat", "She said \"yes\"")
Encoding(x) <- "latin1"
x <- as.factor(x)
print(x)
# Problem diagnosis
check_text(x)
19.4.7 Convert Text to Date or Timestamp
library(lubridate)
library(stringr)
library(magrittr)
# Parse a single date string, first converting Roman-numeral months to digits
# (replacement order matters: "XII" before "XI", "IX" before "X", "IV" before "V").
# Not vectorized: handles one value at a time.
text2date <- function(a) {
  a <- a %>% str_replace("XII", "12") %>%
    str_replace("XI", "11") %>%
    str_replace("IX", "9") %>%
    str_replace("X", "10") %>%
    str_replace("VIII", "8") %>%
    str_replace("VII", "7") %>%
    str_replace("VI", "6") %>%
    str_replace("IV", "4") %>%
    str_replace("V", "5") %>%
    str_replace("III", "3") %>%
    str_replace("II", "2") %>%
    str_replace("I", "1")
  # Try common field orderings until one parses
  x <- dmy(a)
  if (is.na(x)) x <- dym(a)
  if (is.na(x)) x <- ymd(a)
  if (is.na(x)) x <- ydm(a)
  if (is.na(x)) x <- mdy(a)
  if (is.na(x)) x <- myd(a)
  if (is.na(x)) x <- ymd_hms(a)
  if (year(x) > 3000) x <- x - years(100) # guard against runaway year guesses
  return(x)
}
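Illustrative calls (hedged; one value at a time):
text2date("12 XII 1999") # Roman-numeral month, parsed by dmy() -> "1999-12-12"
text2date("1999-12-12")  # falls through to ymd()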
# Split a parsed date into integer YY / MM / DD columns, optionally keeping
# the original text, the Date itself, and a load timestamp
text2dtYYMMDD <- function(a, text = FALSE, date = FALSE, timestamp = FALSE) {
  x <- text2date(a)
  dt <- data.table(YY = year(x) %>% as.integer(),
                   MM = month(x) %>% as.integer(),
                   DD = day(x) %>% as.integer())
  if (text) dt %<>% cbind(data.table(text = a))
  if (date) dt %<>% cbind(data.table(date = x))
  if (timestamp) dt %<>% cbind(data.table(timestamp = now("EST")))
  return(dt)
}
# Try timestamp formats from most to least common, falling back to a bare date
text2timestamp <- function(a) {
  x <- ymd_hms(a)
  if (is.na(x)) x <- ymd_hm(a)
  if (is.na(x)) x <- ymd_h(a)
  if (is.na(x)) x <- dmy_hms(a)
  if (is.na(x)) x <- dmy_hm(a)
  if (is.na(x)) x <- dmy_h(a)
  if (is.na(x)) x <- mdy_hms(a)
  if (is.na(x)) x <- mdy_hm(a)
  if (is.na(x)) x <- mdy_h(a)
  if (is.na(x)) x <- ydm_hms(a)
  if (is.na(x)) x <- ydm_hm(a)
  if (is.na(x)) x <- ydm_h(a)
  if (is.na(x)) x <- text2date(a)
  return(x)
}
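Illustrative calls (hedged):
text2timestamp("2021-03-01 14:05:00") # parsed by ymd_hms()
text2timestamp("01/03/2021 14:05")    # falls through to dmy_hm()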
19.4.8 Transliteration & cleaning
# Transliterate to ASCII, then keep only letters, digits, spaces and hyphens,
# collapse repeated spaces, and lower-case the result
cleanPhrase <- function(phrase) {
  phrase %>%
    iconv(to = "ASCII//TRANSLIT") %>%
    textclean::replace_non_ascii() %>%
    gsub("[^A-Za-z0-9 -]", "", .) %>%
    gsub(" {2,}", " ", .) %>%
    stringr::str_to_lower()
}
# cleanWords() and arrayTranslitFromCyrilic() are defined in Section 19.4.2 above.