# R script for filtering markers in genotypic disequilibrium
#
# To cite this script, please cite Scotti I et al. (2013) RaBoT: a
# rarefaction-by-bootstrap method to compare genome-wide levels of genetic
# diversity. Annals of Forest Science 000:000-000
#
# genot_diseq: identify the largest marker set with less than 5% of marker
# pairs in genotypic disequilibrium, plot the percentage of significant pairs
# against the number of retained markers, and export the filtered table.
#
# Arguments:
#   inputfile: character. Name of a tab-separated table, with AFLP scores
#              (0/1/NA) in columns and samples in rows. See input format for
#              the RaBoT_phenotypes script. Any number of columns can appear
#              before the genotype columns. Columns must have headers.
#   extracol:  integer. The number of columns appearing before the data
#              columns.
#   alpha:     numeric. The (decimal, not percent) threshold for calling a
#              significant genotypic disequilibrium. It is Bonferroni-corrected
#              by the number of marker pairs, both in the initial screen and in
#              every elimination round.
#
# Side effects:
#   - draws a scatter plot on the current graphics device;
#   - writes "<n>mark_lowdiseq_<basename of inputfile>" (tab-separated, n being
#     the number of retained markers) in the directory of `inputfile`.
#
# Value:
#   (invisibly) the data frame that was written: the `extracol` leading
#   columns followed by the retained marker columns, in their original order.
genot_diseq <- function(inputfile, extracol, alpha) {
  stopifnot(
    is.character(inputfile), length(inputfile) == 1L,
    is.numeric(extracol), length(extracol) == 1L,
    extracol >= 0, extracol == round(extracol),
    is.numeric(alpha), length(alpha) == 1L, alpha > 0, alpha <= 1
  )

  data.df <- read.table(file = inputfile, header = TRUE)

  # counts genotype-containing columns
  nloc <- ncol(data.df) - extracol
  if (nloc < 2) {
    stop("at least two marker columns are required after the first ",
         extracol, " column(s)", call. = FALSE)
  }
  marker_cols <- extracol + seq_len(nloc)
  marker_names <- names(data.df)[marker_cols]

  # Fisher exact test p-value for one pair of markers. A marker that is
  # monomorphic among the jointly scored samples yields a table with a single
  # row or column, on which fisher.test() fails; such a pair carries no
  # evidence of association, so it is given p = 1.
  pair_pvalue <- function(a, b) {
    counts <- table(a, b)
    if (nrow(counts) < 2L || ncol(counts) < 2L) {
      return(1)
    }
    fisher.test(counts)$p.value
  }

  # P-values depend only on the data, not on the marker subset under
  # evaluation, so every pair is tested exactly once and the results are
  # reused in all elimination rounds (only the threshold changes).
  # Only the upper triangle (i < j) is filled.
  pvals <- matrix(NA_real_, nrow = nloc, ncol = nloc)
  for (i in seq_len(nloc - 1L)) {
    for (j in (i + 1L):nloc) {
      pvals[i, j] <- pair_pvalue(
        data.df[[marker_cols[i]]],
        data.df[[marker_cols[j]]]
      )
    }
  }

  # Number of significant pairs among the markers in `keep` (ascending marker
  # positions, so that the upper triangle of the sub-matrix is the filled one).
  count_signif <- function(keep, threshold) {
    sub_p <- pvals[keep, keep, drop = FALSE]
    sum(sub_p[upper.tri(sub_p)] < threshold)
  }

  # testing genotypic disequilibrium with Bonferroni correction for multiple
  # testing:
  npairs_all <- nloc * (nloc - 1) / 2
  signif_pairs <- which(
    upper.tri(pvals) & pvals < alpha / npairs_all,
    arr.ind = TRUE
  )
  # each significant pair contributes both of its markers
  markers_with_signif_LD <- marker_names[c(signif_pairs)]

  # gets markers with genotypic diseq., by ascending order of number of pairs
  lowLDlist <- if (length(markers_with_signif_LD) > 0L) {
    names(sort(table(markers_with_signif_LD)))
  } else {
    character(0)
  }
  n_ld <- length(lowLDlist)
  ranked_idx <- match(lowLDlist, marker_names)

  # gets the positions and number of loci NOT involved in genot. diseq.:
  noLD_idx <- which(!(marker_names %in% lowLDlist))
  noLDnum <- length(noLD_idx)

  # loop to decrease the number of markers, one by one (dropping the marker
  # involved in most significant pairs first), and record percent genot.
  # diseq. pairs at each round; stops as soon as it falls below 5%:
  list_percentLD <- numeric(n_ld)
  markers_by_round <- numeric(n_ld)
  rounds <- 0L
  maxloc <- n_ld
  for (z in rev(seq_len(n_ld))) {
    # the z least LD-ed markers, in original column order
    keep <- sort(ranked_idx[seq_len(z)])
    n_retained <- z + noLDnum
    npairs <- n_retained * (n_retained - 1) / 2
    # with a single retained marker there is no pair at all: 0% by definition
    percentLD <- if (npairs > 0) {
      100 * count_signif(keep, alpha / npairs) / npairs
    } else {
      0
    }
    rounds <- rounds + 1L
    list_percentLD[rounds] <- percentLD
    markers_by_round[rounds] <- n_retained
    maxloc <- z
    if (percentLD < 5) break
  }
  if (n_ld == 0L) {
    # no pair was significant: the full marker set is kept, at 0%
    list_percentLD <- 0
    markers_by_round <- nloc
  } else {
    list_percentLD <- list_percentLD[seq_len(rounds)]
    markers_by_round <- markers_by_round[seq_len(rounds)]
  }

  list_percentLD.mt <- cbind(markers_by_round, list_percentLD)
  plot(list_percentLD.mt, pch = 21, bg = "grey", cex = 0.75,
       xlab = "Number of markers", ylab = "Significant pairs (%)",
       main = "Pairs of markers\nin linkage disequilibrium")

  # re-obtaining data sets: the maxloc least LD-ed markers plus all markers
  # never involved in a significant pair, in original column order
  retained_idx <- sort(c(ranked_idx[seq_len(maxloc)], noLD_idx))
  noandlowLDnums <- marker_cols[retained_idx]
  data_lowLD.df <- data.df[, c(seq_len(extracol), noandlowLDnums),
                           drop = FALSE]

  # exporting: the prefix is put on the file name only, so that an `inputfile`
  # containing a directory still yields a valid path next to the input
  out_name <- paste0(length(noandlowLDnums), "mark_lowdiseq_",
                     basename(inputfile))
  out_dir <- dirname(inputfile)
  out_path <- if (out_dir == ".") out_name else file.path(out_dir, out_name)
  write.table(data_lowLD.df, file = out_path,
              row.names = FALSE, sep = "\t", quote = FALSE)

  invisible(data_lowLD.df)
}