# R script "Concordancing with R exemplified"

# (advertisement)
##################################################################
### --- R script "Concordancing with R exemplified
### --- by retrieving all instances of 'like' from the Santa Barbara
### --- Corpus of American English "
### --- Author: Martin Schweinberger (January 8th, 2014)
### --- R-Version: R version 3.0.1 (2013-05-16) -- "Good Sport"
### --- This script was written by Martin Schweinberger
### --- (<http://www.martinschweinberger.de/blog/>).
### --- It extracts concordances from the Santa Barbara Corpus
### --- of American English (SBCAE)
### --- (<http://www.linguistics.ucsb.edu/research/santa-barbara-corpus>).
### --- In order for this script to work you need to have access to
### --- the TRN files of the SBCAE.
### --- This script is made available under the GNU General Public License
### --- <http://www.gnu.org/licenses/gpl.html>.
### --- If you use it, PLEASE QUOTE it as:
### --- Schweinberger, Martin. 2014. Concordancing with R exemplified
### --- by retrieving all instances of 'like' from the Santa Barbara
### --- Corpus of American English. Unpublished R skript. Leuphana
### --- University Luneburg.
### --- THANK YOU. Copyright Martin Schweinberger (2014).
##################################################################
##################################################################
### --- START
##################################################################
# Start from an empty workspace: delete every object, hidden ones included
rm(list = ls(all.names = TRUE))
# Load necessary libraries
library(plyr)
library(data.table)
library(stringr)
### --- Setting parameters
# Regular expression used to locate candidate tokens.
# Members of a character class are listed without separators: the former
# "[L|l]ike" also accepted a literal "|" as first character (e.g. "|ike").
search.pattern <- "[Ll]ike"
#
#
#
#
#
# WARNING: To use this script you need to set your own paths!
# Your path should be the path to the corpus on your own computer.
# Remember to use double backslashes instead of single backslashes if
# you use Windows on your machine. The outputpath should be the
# location where you would like to store the final concordance data set.
# Folder searched (recursively) for the corpus files
pathname <- "C:\\PhD\\skripts n data\\corpora\\SBCAE\\corpusdata\\TRN"
# File that receives the finished concordance table
outputpath <- "C:\\PhD\\skripts n data/kwik sbc raw.txt"
# Number of tokens of context kept on either side of a match
range <- 20
###############################################################
# Collect the corpus files found below `pathname`.
# include.dirs is FALSE because the listing is recursive: with TRUE the
# names of sub-directories are returned as well, and scan() cannot read a
# directory as if it were a file.
# 1) Full paths (used to read the files)
corpus.files <- list.files(
  path = pathname, pattern = NULL, all.files = TRUE,
  full.names = TRUE, recursive = TRUE,
  ignore.case = TRUE, include.dirs = FALSE
)
# 2) The same files as paths relative to `pathname`
corpus.files.names <- list.files(
  path = pathname, pattern = NULL, all.files = TRUE,
  full.names = FALSE, recursive = TRUE,
  ignore.case = TRUE, include.dirs = FALSE
)
###############################################################
# Tokenize the corpus files: one character vector of space-separated
# tokens per file, named by the file path.
# simplify = FALSE guarantees a list. Plain sapply() collapses the result
# into a matrix when only one file is read (or when all files happen to
# contain the same number of tokens), which breaks the per-file processing
# of corpus.tmp further down.
corpus.tmp <- sapply(corpus.files, function(file) {
  lines <- scan(file, what = "char", sep = "\n", quiet = TRUE)
  # Literal tab replacement; perl = TRUE is dropped because it is ignored
  # (with a warning) whenever fixed = TRUE
  lines <- gsub("\t", " ", lines, fixed = TRUE)
  lines <- gsub(" {2,}", " ", lines)
  lines <- str_trim(lines, side = "both")
  unlist(strsplit(lines, " "))
}, simplify = FALSE)
# Positions of the tokens matching the search pattern, computed separately
# for every element of corpus.tmp
concordance.index <- sapply(
  corpus.tmp,
  function(tokens) {
    grep(search.pattern, tokens, ignore.case = TRUE, value = FALSE)
  }
)
# Extract the tokens
#corpus.tmp[[1]][[concordance.index[[1]][1]]]
#corpus.tmp[[1]][[concordance.index[[1]][2]]]
###############################################################
# Extract the matching tokens, one character vector per corpus file.
# simplify = FALSE keeps a named list in every case; plain sapply()
# simplifies the result when all files yield the same number of hits, and
# for more than one hit per file that is a matrix whose names() are NULL,
# so the text IDs below could no longer be derived.
token.orig.raw <- sapply(corpus.tmp, function(tokens) {
  tokens[grep(search.pattern, tokens, ignore.case = TRUE, value = FALSE)]
}, simplify = FALSE)
#str(token.orig.raw)
#token.orig.raw
# Text ID: strip everything up to the last "/" and everything from the
# first "." onwards (the second, identical extension strip was redundant)
text.id.raw <- sub("\\..*", "", sub(".*/", "", names(token.orig.raw)))
# Number of hits per file
len <- vapply(token.orig.raw, length, integer(1), USE.NAMES = FALSE)
# Repeat each text ID once per hit so that it lines up with token.orig
text.id <- rep(text.id.raw, len)
#text.id
token.orig <- as.vector(unlist(token.orig.raw))
#token.orig
###############################################################
# Extract up to `range` tokens preceding each match, pasted into one
# string per hit
pre.orig <- as.vector(unlist(sapply(corpus.tmp, function(tokens) {
  hits <- grep(search.pattern, tokens, ignore.case = TRUE, value = FALSE)
  sapply(hits, function(pos) {
    first <- max(1, pos - range)
    # seq_len() gives an empty index when the match is the first token.
    # The former x[start:stop] counted down there (1:0) and returned the
    # match itself as its own left context.
    context <- tokens[first - 1 + seq_len(pos - first)]
    paste(context, collapse = " ")
  })
})))
#str(pre.orig)
#pre.orig
# Everything that precedes each match within its file, pasted into one
# string per hit
pre.all.orig <- as.vector(unlist(sapply(corpus.tmp, function(tokens) {
  hits <- grep(search.pattern, tokens, ignore.case = TRUE, value = FALSE)
  sapply(hits, function(pos) {
    # All tokens before position pos (none when the match comes first)
    paste(tokens[seq_len(pos - 1)], collapse = " ")
  })
})))
#str(pre.all.orig)
#pre.all.orig
# Extract up to `range` tokens following each match, pasted into one
# string per hit
post.orig <- as.vector(unlist(sapply(corpus.tmp, function(tokens) {
  hits <- grep(search.pattern, tokens, ignore.case = TRUE, value = FALSE)
  sapply(hits, function(pos) {
    # Clamp the window to the end of the file: indexing past the last
    # token yields NA, which paste() turns into literal "NA" words.
    n.after <- min(range, length(tokens) - pos)
    paste(tokens[pos + seq_len(n.after)], collapse = " ")
  })
})))
#str(post.orig)
#post.orig
# Join left context, the match wrapped in << >>, and right context into
# one string per hit (uncleaned corpus material)
orig.tb <- matrix(
  cbind(
    pre.orig,
    rep_len("<<", length(pre.orig)),
    token.orig,
    rep_len(">>", length(pre.orig)),
    post.orig
  ),
  ncol = 5
)
orig <- apply(orig.tb, 1, function(cells) paste(cells, collapse = " "))
#head(orig)
###############################################################
# Strip transcription mark-up from corpus text.
# Arguments: txt - character vector (one string per hit).
# Returns:   unnamed character vector with one cleaned string per element.
# Every step is vectorised, so the whole vector is processed at once; this
# single helper replaces four copies of the same pipeline that were each
# wrapped in two nested per-element sapply() calls.
clean.transcription <- function(txt) {
  txt <- gsub("\t", "", txt, fixed = TRUE)      # tabs
  txt <- gsub("(\\([A-Z]{1,5}\\))", "", txt)    # 1-5 capitals in parentheses
  txt <- gsub("(X{1,5}[0-9]\\])", "", txt)      # X..., one digit and "]"
  txt <- gsub("(<X|X>)", "", txt)               # "<X" and "X>"
  txt <- gsub("([0-9]{1,8})", "", txt)          # digits
  txt <- gsub("(\\{|\\}|\\[|\\]|\\(|\\)|~|,|=|-|%|/)", "", txt)
  txt <- gsub("(\\. {1,}\\.)", "..", txt)       # ". ." becomes ".."
  txt <- str_trim(txt, side = "both")
  txt <- gsub(" {2,}", " ", txt)                # collapse runs of spaces
  as.vector(txt)
}
# Clean preceding content (the window of up to `range` tokens)
pre.tmp <- clean.transcription(pre.orig)
# Clean all content preceding a match
pre.all <- clean.transcription(pre.all.orig)
# Clean matches
token.tmp <- clean.transcription(token.orig)
# Extract speakers who uttered the match: mark every speaker label
# (two or more capitals, optional digit, colon) in the text up to and
# including the match, keep what follows the last mark, then cut at ":"
spk.ref.tmp1 <- matrix(cbind(pre.all, token.tmp), ncol = 2)
spk.ref.tmp2 <- apply(spk.ref.tmp1, 1, paste, collapse = " ")
spk.ref <- gsub("([A-Z]{2,}[0-9]{0,1}:)", "#~#\\1", spk.ref.tmp2)
spk.ref <- gsub(".*#~#", "", spk.ref)
spk.ref <- as.vector(gsub(":.*", "", spk.ref))
#head(spk.ref)
# Clean subsequent content
post.tmp <- clean.transcription(post.orig)
# Create a vector out of the clean corpus material surrounding the match
test.tb <- matrix(cbind(pre.tmp, rep("<<", length(pre.tmp)), token.tmp,
                        rep(">>", length(pre.tmp)), post.tmp), ncol = 5)
test <- apply(test.tb, 1, paste, collapse = " ")
test <- gsub(" {2,}", " ", test)
test <- str_trim(test, side = "both")
#head(test)
###############################################################
# Create a table of the extracted information (one row per hit).
# seq_along() is used for the running id because 1:length(x) counts down
# (1, 0) when there are no hits.
kwik.tmp1 <- cbind(seq_along(text.id), text.id, spk.ref, pre.tmp, token.tmp,
                   post.tmp, orig, test)
# Column headers. "previous element(s)" is written on a single line: a line
# break inside the string literal becomes part of the column name.
colnames(kwik.tmp1) <- c("id", "text.id", "spk.ref", "previous element(s)",
                         "token", "following element(s)", "orig.data",
                         "test.column")
#length(kwik.tmp1[, 1])
#head(kwik.tmp1)
# Find non matches 1: keep only hits whose cleaned token (column 5) is
# exactly "like".
# The statement had been garbled: the `hit <-` assignment and the `x <-`
# re-assignments were lost, so the code did not parse and the lower-casing
# and colon stripping had no effect. They are restored here in vectorised
# form.
hit <- tolower(unname(kwik.tmp1[, 5]))
hit <- gsub(".*:", "", hit)             # drop everything up to the last ":"
hit <- gsub("@|\\?|\\.|b_", "", hit)    # drop "@", "?", "." and "b_"
hit <- ifelse(hit == "like", "like", "0")
kwik.tmp2 <- cbind(kwik.tmp1, hit)
# drop = FALSE keeps the matrix shape even if only one row survives
kwik.tmp2 <- kwik.tmp2[kwik.tmp2[, 9] != "0", , drop = FALSE]
#head(kwik.tmp2)
#length(kwik.tmp2[, 1])
# Find non matches 2: discard "like" in fixed frames. Each pattern is matched
# against the lower-cased test column (column 8, built as
# "<left context> << like >> <right context>"); a match replaces the whole
# string by "0". gsub() is vectorised, so the column is processed at once.
hit <- tolower(unname(kwik.tmp2[, 8]))
hit <- gsub("(\\.)", "", hit) #remove punctuation
hit <- gsub("( {2,})", " ", hit) #remove extra spaces
hit <- gsub(".*thing << like >> th.*", "0", hit) #thing like that
hit <- gsub(".*stuff << like >> th.*", "0", hit) #stuff like that
hit <- gsub(".*ould << like >>.*", "0", hit) #would like
hit <- gsub(".*<< like >> to .*", "0", hit) # like to
# The two "n't" patterns now contain the space that always precedes "<<"
# in the test column; without it they could never match.
hit <- gsub(".*n't << like >> .*", "0", hit) # don't like
hit <- gsub(".*n’t << like >> .*", "0", hit) # don't like
hit <- gsub(".*taste[s]{0,1} << like >> .*", "0", hit) # taste like
hit <- gsub(".*smell[s]{0,1} << like >> .*", "0", hit) # smell like
hit <- gsub(".*feel[s]{0,1} << like >> .*", "0", hit) # feel like
hit <- gsub(".*sound[s]{0,1} << like >> .*", "0", hit) # sound like
hit <- gsub(".*look[s]{0,1} << like >> .*", "0", hit) # look like
hit <- gsub(".*seem[s]{0,1} << like >> .*", "0", hit) # seem like
hit <- gsub(".* it {0,1}'s << like >> .*", "0", hit) # it's like
# Remove rows without proper hit
# drop = FALSE keeps the matrix shape when a single row is left
kwik.tmp3 <- cbind(kwik.tmp2[, 1:8, drop = FALSE], hit)
kwik.tmp3 <- kwik.tmp3[kwik.tmp3[, 9] != "0", , drop = FALSE]
#head(kwik.tmp3)
#length(kwik.tmp3[, 1])
# Rename final kwik
kwik.sbc.raw <- kwik.tmp3
# Inspect resulting kwik
#kwik.sbc.raw
#head(kwik.sbc.raw)
#kwik.sbc.raw[, 9]
###############################################################
###############################################################
###############################################################
### --- Important objects
#kwik.sbc.raw
#head(kwik.sbc.raw)
###############################################################
###############################################################
###############################################################
###############################################################
# Save results in a txt file
# Create (or truncate) the target file; the returned flag is kept in
# output.file
output.file <- file.create(outputpath, showWarnings = FALSE)
# Write the concordance table to that file, tab-separated, no row names
write.table(
  x = kwik.sbc.raw, file = outputpath,
  sep = "\t", row.names = FALSE
)
###############################################################
# Remove all lists from the current workspace
#rm(list=ls(all=T))
###############################################################
###############################################################
### --THE END
###############################################################
###############################################################
# Download