##################################################################
### --- R script "Concordancing with R exemplified
### --- by retrieving all instances of 'like' from the Santa Barbara
### --- Corpus of American English"
### --- Author: Martin Schweinberger (January 8th, 2014)
### --- R-Version: R version 3.0.1 (2013-05-16) -- "Good Sport"
### ---
### --- This script was written by Martin Schweinberger
### --- (<http://www.martinschweinberger.de/blog/>).
### --- It extracts concordances from the Santa Barbara Corpus
### --- of American English (SBCAE)
### --- (<http://www.linguistics.ucsb.edu/research/santa-barbara-corpus>).
### --- In order for this script to work, you need to have access to
### --- the TRN files of the SBCAE.
### --- This script is made available under the GNU General Public License
### --- <http://www.gnu.org/licenses/gpl.html>.
### --- If you use it, PLEASE QUOTE it as:
### --- Schweinberger, Martin. 2014. Concordancing with R exemplified
### --- by retrieving all instances of 'like' from the Santa Barbara
### --- Corpus of American English. Unpublished R script. Leuphana
### --- University Lüneburg.
### --- THANK YOU. Copyright Martin Schweinberger (2014).
##################################################################
##################################################################
### --- START
##################################################################
# Remove all objects from the current workspace
rm(list = ls(all = T))
# Load necessary libraries
library(plyr)
library(data.table)
library(stringr)
### --- Setting parameters
# Determine the search pattern
# (the character class [Ll] matches "like" and "Like"; a class written
# as [L|l] would also match the literal pipe character)
search.pattern <- c("[Ll]ike")
# WARNING: To use this script you need to set your own paths!
# Your path should be the path to the corpus on your own computer!
# Remember to use double backslashes instead of single backslashes
# if you use Windows on your machine.
# The output path should be the location where you would like to
# store the final concordance data set.
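# For reference, a hedged example with placeholder paths (not the
# author's): on Linux or macOS the same parameters might look like this.
#pathname <- "/home/user/SBCAE/corpusdata/TRN"
#outputpath <- "/home/user/SBCAE/kwik sbc raw.txt"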
# Specify pathname of the corpus
pathname <- "C:\\PhD\\skripts n data\\corpora\\SBCAE\\corpusdata\\TRN"
# Define the output path
outputpath <- "C:\\PhD\\skripts n data\\kwik sbc raw.txt"
# Determine the range (number of tokens of context on either side of a match)
range <- 20
###############################################################
# Load the files you would like to search directly
# by specifying the path
# 1) Load the full paths of the corpus files
corpus.files <- list.files(path = pathname, pattern = NULL, all.files = T,
  full.names = T, recursive = T, ignore.case = T, include.dirs = T)
# 2) Load the names of the corpus files
corpus.files.names <- list.files(path = pathname, pattern = NULL, all.files = T,
  full.names = F, recursive = T, ignore.case = T, include.dirs = T)
###############################################################
# Tokenize the corpus files
corpus.tmp <- sapply(corpus.files, function(x) {
  x <- scan(x, what = "char", sep = "\n", quiet = T)
  x <- gsub("\t", " ", x, fixed = T)
  x <- gsub(" {2,}", " ", x)
  x <- str_trim(x, side = "both")
  x <- unlist(strsplit(x, " "))
} )
# Extract the positions of the tokens
concordance.index <- sapply(corpus.tmp, function(x) {
  x <- grep(search.pattern, x, ignore.case = T, value = F)
} )
# Inspect individual tokens
#corpus.tmp[[1]][[concordance.index[[1]][1]]]
#corpus.tmp[[1]][[concordance.index[[1]][2]]]
###############################################################
# Extract tokens
token.orig.raw <- sapply(corpus.tmp, function(x) {
  x[grep(search.pattern, x, ignore.case = T, value = F)]
} )
#str(token.orig.raw)
#token.orig.raw
# Extract the text id of each file from its path
text.id.raw <- as.vector(unlist(sapply(names(token.orig.raw), function(x) {
  x <- gsub(".*/", "", x)
  x <- gsub("\\..*", "", x)
} )))
# Count the number of matches per file
len <- as.vector(unlist(sapply(token.orig.raw, function(x) {
  x <- length(x)
} )))
text.id <- rep(text.id.raw, len)
#text.id
token.orig <- as.vector(unlist(token.orig.raw))
#token.orig
###############################################################
# Extract the previous 20 elements
pre.orig <- as.vector(unlist(sapply(corpus.tmp, function(x) {
  position.of.matches <- grep(search.pattern, x, ignore.case = T, value = F)
  sapply(position.of.matches, function(y) {
    start <- max(1, as.numeric(y) - range)
    stop <- as.numeric(y) - 1
    z <- x[start:stop]
    z <- paste(z, collapse = " ")
  } )
} )))
#str(pre.orig)
#pre.orig
# Extract all previous context
# (parentheses around (y - 1) are needed: ":" binds more tightly than "-")
pre.all.orig <- as.vector(unlist(sapply(corpus.tmp, function(x) {
  position.of.matches <- grep(search.pattern, x, ignore.case = T, value = F)
  sapply(position.of.matches, function(y) {
    z <- x[1:(y - 1)]
    z <- paste(z, collapse = " ")
  } )
} )))
#str(pre.all.orig)
#pre.all.orig
# Extract all following context
# (the end point is capped at the file length to avoid NA padding)
post.orig <- as.vector(unlist(sapply(corpus.tmp, function(x) {
  position.of.matches <- grep(search.pattern, x, ignore.case = T, value = F)
  sapply(position.of.matches, function(y) {
    end <- min(length(x), y + range)
    z <- x[(y + 1):end]
    z <- paste(z, collapse = " ")
  } )
} )))
#str(post.orig)
#post.orig
# Create a vector out of the original corpus material surrounding the match
orig.tb <- matrix(cbind(pre.orig, rep("<<", length(pre.orig)), token.orig,
  rep(">>", length(pre.orig)), post.orig), ncol = 5)
orig <- apply(orig.tb, 1, paste, collapse = " ")
#head(orig)
###############################################################
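# The four cleaning blocks below apply one and the same chain of
# substitutions to the preceding context, the full preceding context,
# the match, and the following context. As an optional sketch, the
# chain could be bundled into a single helper and reused, e.g.
# pre.tmp <- as.vector(unlist(sapply(pre.orig, clean.corpus.material)));
# the explicit blocks are kept below for transparency.
clean.corpus.material <- function(y) {
  y <- gsub("\t", "", y, fixed = T)
  y <- gsub("(\\([A-Z]{1,5}\\))", "", y)
  y <- gsub("(X{1,5}[0-9]\\])", "", y)
  y <- gsub("(<X|X>)", "", y)
  y <- gsub("([0-9]{1,8})", "", y)
  y <- gsub("(\\{|\\}|\\[|\\]|\\(|\\)|~|,|=|-|%|/)", "", y)
  y <- gsub("(\\. {1,}\\.)", "..", y)
  y <- str_trim(y, side = "both")
  y <- gsub(" {2,}", " ", y)
}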
# Clean preceding content
pre.tmp <- as.vector(unlist(sapply(pre.orig, function(x) {
  sapply(x, function(y) {
    y <- gsub("\t", "", y, fixed = T)
    y <- gsub("(\\([A-Z]{1,5}\\))", "", y)
    y <- gsub("(X{1,5}[0-9]\\])", "", y)
    y <- gsub("(<X|X>)", "", y)
    y <- gsub("([0-9]{1,8})", "", y)
    y <- gsub("(\\{|\\}|\\[|\\]|\\(|\\)|~|,|=|-|%|/)", "", y)
    y <- gsub("(\\. {1,}\\.)", "..", y)
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
  } )
} )))
# Clean all content preceding a match
pre.all <- as.vector(unlist(sapply(pre.all.orig, function(x) {
  sapply(x, function(y) {
    y <- gsub("\t", "", y, fixed = T)
    y <- gsub("(\\([A-Z]{1,5}\\))", "", y)
    y <- gsub("(X{1,5}[0-9]\\])", "", y)
    y <- gsub("(<X|X>)", "", y)
    y <- gsub("([0-9]{1,8})", "", y)
    y <- gsub("(\\{|\\}|\\[|\\]|\\(|\\)|~|,|=|-|%|/)", "", y)
    y <- gsub("(\\. {1,}\\.)", "..", y)
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
  } )
} )))
# Clean matches
token.tmp <- as.vector(unlist(sapply(token.orig, function(x) {
  sapply(x, function(y) {
    y <- gsub("\t", "", y, fixed = T)
    y <- gsub("(\\([A-Z]{1,5}\\))", "", y)
    y <- gsub("(X{1,5}[0-9]\\])", "", y)
    y <- gsub("(<X|X>)", "", y)
    y <- gsub("([0-9]{1,8})", "", y)
    y <- gsub("(\\{|\\}|\\[|\\]|\\(|\\)|~|,|=|-|%|/)", "", y)
    y <- gsub("(\\. {1,}\\.)", "..", y)
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
  } )
} )))
# Extract the speakers who uttered the match: insert a marker before each
# speaker code, keep the material after the last marker, and strip
# everything from the colon onwards
spk.ref.tmp1 <- matrix(cbind(pre.all, token.tmp), ncol = 2)
spk.ref.tmp2 <- apply(spk.ref.tmp1, 1, paste, collapse = " ")
spk.ref <- as.vector(unlist(sapply(spk.ref.tmp2, function(x) {
  x <- gsub("([A-Z]{2,}[0-9]{0,1}:)", "#~#\\1", x)
  x <- gsub(".*#~#", "", x)
  x <- gsub(":.*", "", x)
} )))
#head(spk.ref)
# Clean subsequent content
post.tmp <- as.vector(unlist(sapply(post.orig, function(x) {
  sapply(x, function(y) {
    y <- gsub("\t", "", y, fixed = T)
    y <- gsub("(\\([A-Z]{1,5}\\))", "", y)
    y <- gsub("(X{1,5}[0-9]\\])", "", y)
    y <- gsub("(<X|X>)", "", y)
    y <- gsub("([0-9]{1,8})", "", y)
    y <- gsub("(\\{|\\}|\\[|\\]|\\(|\\)|~|,|=|-|%|/)", "", y)
    y <- gsub("(\\. {1,}\\.)", "..", y)
    y <- str_trim(y, side = "both")
    y <- gsub(" {2,}", " ", y)
  } )
} )))
# Create a vector out of the clean corpus material surrounding the match
test.tb <- matrix(cbind(pre.tmp, rep("<<", length(pre.tmp)), token.tmp,
  rep(">>", length(pre.tmp)), post.tmp), ncol = 5)
test <- apply(test.tb, 1, paste, collapse = " ")
test <- gsub(" {2,}", " ", test)
test <- str_trim(test, side = "both")
#head(test)
###############################################################
# Create a table of the extracted information
kwik.tmp1 <- cbind(1:length(text.id), text.id, spk.ref, pre.tmp, token.tmp,
  post.tmp, orig, test)
colnames(kwik.tmp1) <- c("id", "text.id", "spk.ref", "previous element(s)",
  "token", "following element(s)", "orig.data", "test.column")
#length(kwik.tmp1[, 1])
#head(kwik.tmp1)
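# The two filtering passes below remove false hits: the first pass keeps
# only rows whose token is exactly "like" once transcription symbols are
# stripped; the second discards verbal and prepositional uses ("would
# like", "looks like", etc.) so that discourse-marker "like" remains.
# An illustrative check on a made-up string (hedged, not corpus material):
#gsub(".*look[s]{0,1} << like >> .*", "0", "she looks << like >> her mom")
# returns "0", i.e. the row would be flagged and removed.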
# Find non matches 1
hit <- as.vector(unlist(sapply(kwik.tmp1[, 5], function(x) {
  x <- tolower(x)
  x <- gsub(".*:", "", x)
  x <- gsub("@|\\?|\\.|b_", "", x)
} )))
hit <- as.vector(unlist(sapply(hit, function(x) {
  ifelse(x == "like", "like", "0")
} )))
kwik.tmp2 <- cbind(kwik.tmp1, hit)
kwik.tmp2 <- kwik.tmp2[!kwik.tmp2[, 9] == "0", ]
#head(kwik.tmp2)
#length(kwik.tmp2[, 1])
# Find non matches 2
hit <- as.vector(unlist(sapply(kwik.tmp2[, 8], function(x) {
  x <- tolower(x)
  x <- gsub("(\\.)", "", x)                           # remove punctuation
  x <- gsub("( {2,})", " ", x)                        # remove extra spaces
  x <- gsub(".*thing << like >> th.*", "0", x)        # thing like that
  x <- gsub(".*stuff << like >> th.*", "0", x)        # stuff like that
  x <- gsub(".*ould << like >>.*", "0", x)            # would like
  x <- gsub(".*<< like >> to .*", "0", x)             # like to
  x <- gsub(".*n't << like >> .*", "0", x)            # don't like
  x <- gsub(".*n’t << like >> .*", "0", x)            # don't like
  x <- gsub(".*taste[s]{0,1} << like >> .*", "0", x)  # taste like
  x <- gsub(".*smell[s]{0,1} << like >> .*", "0", x)  # smell like
  x <- gsub(".*feel[s]{0,1} << like >> .*", "0", x)   # feel like
  x <- gsub(".*sound[s]{0,1} << like >> .*", "0", x)  # sound like
  x <- gsub(".*look[s]{0,1} << like >> .*", "0", x)   # look like
  x <- gsub(".*seem[s]{0,1} << like >> .*", "0", x)   # seem like
  x <- gsub(".* it {0,1}'s << like >> .*", "0", x)    # it's like
} )))
# Remove rows without a proper hit
kwik.tmp3 <- cbind(kwik.tmp2[, 1:8], hit)
kwik.tmp3 <- kwik.tmp3[kwik.tmp3[, 9] != "0", ]
#head(kwik.tmp3)
#length(kwik.tmp3[, 1])
# Rename the final kwik
kwik.sbc.raw <- kwik.tmp3
# Inspect the resulting kwik
#kwik.sbc.raw
#head(kwik.sbc.raw)
#kwik.sbc.raw[, 9]
###############################################################
###############################################################
###############################################################
### --- Important objects
#kwik.sbc.raw
#head(kwik.sbc.raw)
###############################################################
###############################################################
###############################################################
###############################################################
# Save results in a txt file
# Create the file in which to store the results
output.file <- file.create(outputpath, showWarnings = F)
# Store the table in the output file
write.table(kwik.sbc.raw, outputpath, sep = "\t", row.names = F)
###############################################################
# Remove all objects from the current workspace
#rm(list = ls(all = T))
###############################################################
###############################################################
### --- THE END
###############################################################
###############################################################