#########################################################################################
#
#                              ### BSA SNP Supplies ###
#
#
# This file contains the scripts for the functions that power the BSAsnp() function
# function that  performs BSA using SNP chips with the help of a few user inputs.
# If you are trying to perform BSA, you may have opened
# this file by mistake, please open "BSAsnp.R" instead.
# 
#
#
# Becker, July 2010
#########################################################################################






#########################################################################################
# This section defines the map.graph function for the final mapping and plots.



map.graph <- function(p1,p2,a1,a2,file.stem, plotcM=F){

colvan <- na.omit(new.snp.data[,c(p1,p2)]) # get the relevant columns
rownames(colvan) <- new.snp.coord
sense.mat <- do.call('rbind', strsplit(rownames(sense), "_")) # create a matrix
sense.coord <- paste("X",sense.mat[,1],"_",sense.mat[,2], sep = "") # reformat naming scheme
markers <- colvan[colvan[,1] != colvan[,2],] # get the markers
sense.m <- sense[sense.coord %in% rownames(markers),] # find the sense markers
antisense.m <- antisense[sense.coord %in% rownames(markers),] # the antisense markers have the same index as sense
mark.coord <- as.data.frame(sense.mat[sense.coord %in% rownames(markers),c(1:2)], stringsAsFactors = FALSE)
mark.coord[,1] <- as.numeric(mark.coord[,1])
mark.coord[,2] <- as.numeric(mark.coord[,2])
colnames(mark.coord) <- c("Chr","bp")

#Now make cM like Wolyn, et al
if(plotcM){chr.len <- round(c(96.6, 83.3, 103.9, 64.5, 97.7)) #kosambi map distances
# assume equal recombination rate across chromosome
# assign cM from physical position
chr.bp <- tapply(mark.coord$bp,mark.coord$Chr,max)
cM <- list()
for (i in 1:5) cM[[i]] <- mark.coord$bp[mark.coord$Chr == i]*chr.len[i]/chr.bp[i]
mark.coord$cM <- unlist(cM)}


#now do the comparison
sense.comp <- sense.m[,a1] - sense.m[,a2]  #Hk -Lk
antisense.comp <- antisense.m[,a1] - antisense.m[,a2] #Hk - Lk

smooth.val <- 0.25 #for the loess smooth

#do sense
smooth.dif <- list()
for (i in 1:5){
    bulk1 <- cbind(sense.comp[mark.coord$Chr==i], mark.coord[mark.coord$Chr==i,])
    smooth.dif[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted
	}#close chrom loop

#now do antisense
smooth.dif.anti <- list()
for (i in 1:5){
    bulk1 <- cbind(antisense.comp[mark.coord$Chr==i], mark.coord[mark.coord$Chr==i,])
    smooth.dif.anti[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted
	}#close chrom loop



################################################
#Now do the control experiment with the probes that aren't different
num.mark <- nrow(markers) # get the number of markers, use the same # for the controls
con.markers <- colvan[colvan[,1] == colvan[,2],]
con.markers <-con.markers[sample(1:nrow(con.markers), nrow(markers)),]
con.sense.m <- sense[sense.coord %in% rownames(con.markers),]
con.antisense.m <- antisense[sense.coord %in% rownames(con.markers),]
con.mark.coord <- as.data.frame(sense.mat[sense.coord %in% rownames(con.markers),c(1:2)], stringsAsFactors = FALSE)
con.mark.coord[,1] <- as.numeric(con.mark.coord[,1])
con.mark.coord[,2] <- as.numeric(con.mark.coord[,2])
colnames(con.mark.coord) <- c("Chr","bp")
if(plotcM){chr.len <- round(c(96.6, 83.3, 103.9, 64.5, 97.7)) #kosambi map distances
# assume equal recombination rate across chromosome
# assign cM from physical position
chr.bp <- tapply(con.mark.coord$bp,con.mark.coord$Chr,max)
cM.con <- list()
for (i in 1:5) cM.con[[i]] <- con.mark.coord$bp[con.mark.coord$Chr == i]*chr.len[i]/chr.bp[i]
con.mark.coord$cM <- unlist(cM.con)}


#now do the comparison
con.sense.comp <- con.sense.m[,a1] - con.sense.m[,a2]  #HP -LP
con.antisense.comp <- con.antisense.m[,a1] - con.antisense.m[,a2] #HP - LP

smooth.val <- 0.25 # for the loess smooth

# do sense
con.smooth.dif <- list()
for (i in 1:5){
    bulk1 <- cbind(con.sense.comp[con.mark.coord$Chr==i], con.mark.coord[con.mark.coord$Chr==i,])
    con.smooth.dif[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted
	}# close chrom loop


#now do antisense
con.smooth.dif.anti <- list()
for (i in 1:5){
    bulk1 <- cbind(con.antisense.comp[con.mark.coord$Chr==i], con.mark.coord[con.mark.coord$Chr==i,])
    con.smooth.dif.anti[[i]] <- loess((bulk1[,1]) ~ bulk1$bp ,span = smooth.val)$fitted
	}# close chrom loop


### make the spreadsheet to save the sense data:
traceout.1 <- data.frame("chr1",mark.coord$bp[mark.coord$Chr==1]/1e6, mark.coord$bp[mark.coord$Chr==1]/1e6,smooth.dif[1])
traceout.2 <- data.frame("chr2",(30432385+mark.coord$bp[mark.coord$Chr==2])/1e6, mark.coord$bp[mark.coord$Chr==2]/1e6,smooth.dif[2])
traceout.3 <- data.frame("chr3",(50136912+mark.coord$bp[mark.coord$Chr==3])/1e6, mark.coord$bp[mark.coord$Chr==3]/1e6,smooth.dif[3])
traceout.4 <- data.frame("chr4",(73607619+mark.coord$bp[mark.coord$Chr==4])/1e6, mark.coord$bp[mark.coord$Chr==4]/1e6,smooth.dif[4])
traceout.5 <- data.frame("chr5",(92192531+mark.coord$bp[mark.coord$Chr==5])/1e6, mark.coord$bp[mark.coord$Chr==5]/1e6,smooth.dif[5])
colnames(traceout.1) <- c("chromosome","tot_bp","bp","diff")
colnames(traceout.2) <- c("chromosome","tot_bp","bp","diff")
colnames(traceout.3) <- c("chromosome","tot_bp","bp","diff")
colnames(traceout.4) <- c("chromosome","tot_bp","bp","diff")
colnames(traceout.5) <- c("chromosome","tot_bp","bp","diff")
final.table <- rbind(traceout.1,traceout.2,traceout.3,traceout.4,traceout.5)

### add cM if they're interested.
if(plotcM){final.table$cM <- unlist(cM)}


### save the spreadsheet
t.file <- paste(file.stem, "snpprobemapping.csv", sep = "_")
write.table(final.table,file = t.file, sep = ",", row.names=F)

### now get the graph limits
g.max <- max(c(unlist(smooth.dif.anti),unlist(smooth.dif),unlist(con.smooth.dif.anti),unlist(con.smooth.dif)))
g.min <- min(c(unlist(smooth.dif.anti),unlist(smooth.dif),unlist(con.smooth.dif.anti),unlist(con.smooth.dif)))

### prepare to save the graphs in a pdf
fn <- paste(file.stem, "snpprobemapping.pdf", sep = "_")
pdf(fn, width = 11, height = 8.5)
par(mfrow=c(1,5))


### choose whether to plot by cM or Mb, then make the plots
if(plotcM){
for (i in 1:5){ 
	skim=seq.int(1, nrow(mark.coord[mark.coord$Chr==i,]),4)
	plot((mark.coord$cM[mark.coord$Chr==i])[skim],(smooth.dif[[i]])[skim],type="l",ylim=c(g.min, g.max),
		ylab = "allele frequency difference", xlab = paste("cM chromosome",i), main = file.stem)
		lines((mark.coord$cM[mark.coord$Chr==i])[skim],(smooth.dif.anti[[i]])[skim], col = 'blue')
		lines((con.mark.coord$cM[con.mark.coord$Chr==i])[skim],(con.smooth.dif.anti[[i]])[skim], col = 'yellow')
		lines((con.mark.coord$cM[con.mark.coord$Chr==i])[skim],(con.smooth.dif[[i]])[skim], col = 'brown')
		
                    abline(h=0)
				}#close chrom loop
}else{


for (i in 1:5){ 
	skim=seq.int(1, nrow(mark.coord[mark.coord$Chr==i,]),4)
	plot((mark.coord$bp[mark.coord$Chr==i]/1e6)[skim],(smooth.dif[[i]])[skim],type="l",ylim=c(g.min, g.max),
		ylab = "allele frequency difference", xlab = paste("Mb chromosome",i), main = file.stem)
		lines((mark.coord$bp[mark.coord$Chr==i]/1e6)[skim],(smooth.dif.anti[[i]])[skim], col = 'blue')
		lines((con.mark.coord$bp[con.mark.coord$Chr==i]/1e6)[skim],(con.smooth.dif.anti[[i]])[skim], col = 'yellow')
		lines((con.mark.coord$bp[con.mark.coord$Chr==i]/1e6)[skim],(con.smooth.dif[[i]])[skim], col = 'brown')
		
                    abline(h=0)

				}#close chrom loop
}#close if/else
dev.off() #shut that pdf



} #end of function loop
#########################################################################################















#########################################################################################
# This section defines the BSAsnp function that runs the user-driven process of BSA
# using SNP chips.




BSAsnp <- function(){
	
# find which OS-dependent readcel function to use.
os.type= readline("What type of OS are you using? \nEnter 1 for PC, Enter 2 for Mac, Linux, or Unix: ")

# choose which readcel function to use, with default as Mac/Unix/Linux:
if (as.numeric(os.type)==1){

#set the memory limit higher
memory.limit(size = 3095)	
	
	
	### readcel.R for spatial correction
library(affy)
library(stats)
readcel <- function(cel.files, probe.number, array.size, filter.size ){ 

mprobe.mean <- matrix(NA, nr=probe.number, nc=length(cel.files) )
for (i in 1:length(cel.files) )
{
probe.mean <- log(intensity(read.affybatch(filenames=cel.files [i] ) ) )
mprobe.mean[,i] <- matrix(probe.mean, nc=array.size, byrow=T) [array.size:1,] [probe.ok] 
}

# calculate median probe within array
meds <- apply(mprobe.mean,2,median)
mprobe.resid <- t( t(mprobe.mean) - meds)
filter.size <- 2*trunc(filter.size /2)+1  # force to be odd so the edge will be same length = filter.size/2
fe <- (filter.size-1)/2  # filter edge
mprobe.bg <- mprobe.resid

# do spatial correction
for (i in 1:length(cel.files))
{
    sp.mat <- matrix(0,nr= array.size,nc=array.size)
    sp.mat[probe.ok] <- mprobe.resid[,i]
    sp.mat[!probe.ok] <- median(mprobe.resid[,i])

    sp.mat.sum <- apply(sp.mat,2,filter,rep(1,filter.size))  # from start to end, every 51 probes sum in moving 
    sp.mat.sum <- apply(sp.mat.sum,1,filter,rep(1,filter.size))
    sp.mat.count <- apply(probe.ok,2,filter,rep(1,filter.size)) 
    sp.mat.count <- apply(sp.mat.count,1,filter,rep(1,filter.size))  #how many probes have been added up to contribute the sp.mat.sum
    spatial.corrected <- t(sp.mat.sum/sp.mat.count)

    # fill in edges, with the background of edge boarder
    spatial.corrected[1:fe,] <- rep(spatial.corrected[fe+1, ], each = fe)
    spatial.corrected[(array.size+1 - fe):array.size, ] <-
                        rep(spatial.corrected[array.size - fe, ], each = fe)
    spatial.corrected[,1:(fe)] <- spatial.corrected[,fe+1]
    spatial.corrected[,(array.size+1 - fe):array.size] <-
                                      spatial.corrected[,array.size - fe]
    
    mprobe.bg[,i] <- spatial.corrected[probe.ok]

    #bitmap( paste("spatialResid", cel.files[i], ".png", sep="" ) )
    #image(1:array.size, 1:array.size, spatial.corrected, 
    #              main = paste(cel.files[i], "spatialResid", filter.size,sep="") )
    #dev.off()

}

 mprobe.mean <- (mprobe.mean - mprobe.bg)[chrom.order,]
 return (mprobe.mean)

}

	
	
	
	
	
	}else{
		
### readcel.R for spatial correction on Macs and everything other than PCs
library(affy)
library(stats)
readcel <- function(cel.files, probe.number, array.size, filter.size ){ 

mprobe.mean <- matrix(NA, nr=probe.number, nc=length(cel.files) )
for (i in 1:length(cel.files) )
{
probe.mean <- log(intensity(read.affybatch(filenames=cel.files [i] ) ) )
mprobe.mean[,i] <- matrix(probe.mean, nc=array.size, byrow=T) [array.size:1,] [probe.ok] 
}

# calculate median probe within array
meds <- apply(mprobe.mean,2,median)
mprobe.resid <- t( t(mprobe.mean) - meds)
filter.size <- 2*trunc(filter.size /2)+1  # force to be odd so the edge will be same length = filter.size/2
fe <- (filter.size-1)/2  # filter edge
mprobe.bg <- mprobe.resid

# do spatial correction
for (i in 1:length(cel.files))
{
    sp.mat <- matrix(0,nr= array.size,nc=array.size)
    sp.mat[probe.ok] <- mprobe.resid[,i]
    sp.mat[!probe.ok] <- median(mprobe.resid[,i])

    sp.mat.sum <- apply(sp.mat,2,filter,rep(1,filter.size))  # from start to end, every 51 probes sum in moving 
    sp.mat.sum <- apply(sp.mat.sum,1,filter,rep(1,filter.size))
    sp.mat.count <- apply(probe.ok,2,filter,rep(1,filter.size)) 
    sp.mat.count <- apply(sp.mat.count,1,filter,rep(1,filter.size))  #how many probes have been added up to contribute the sp.mat.sum
    spatial.corrected <- t(sp.mat.sum/sp.mat.count)

    # fill in edges, with the background of edge boarder
    spatial.corrected[1:fe,] <- rep(spatial.corrected[fe+1, ], each = fe)
    spatial.corrected[(array.size+1 - fe):array.size, ] <-
                        rep(spatial.corrected[array.size - fe, ], each = fe)
    spatial.corrected[,1:(fe)] <- spatial.corrected[,fe+1]
    spatial.corrected[,(array.size+1 - fe):array.size] <-
                                      spatial.corrected[,array.size - fe]
    
    mprobe.bg[,i] <- spatial.corrected[probe.ok]

    #bitmap( paste("spatialResid", cel.files[i], ".png", sep="" ) )
    #image(1:array.size, 1:array.size, spatial.corrected, 
     #             main = paste(cel.files[i], "spatialResid", filter.size,sep="") )
    #dev.off()

}#close reading loop
mprobe.mean <- (mprobe.mean - mprobe.bg)[chrom.order,]
return (mprobe.mean)

}#close readcel function
}#close if/else





#set your working directory to the supplementary directory
supp.dir=readline("Enter the path to your supplementary directory: ")
cat("Please wait while supplementary data loads.") #be polite.
setwd(supp.dir)

#load the supplementary files
load("snp.RData", .GlobalEnv) #annotation for SNP probes from Affymetrix
load("snp.calls1020.RData", .GlobalEnv) #polymorphism data from Nordborg Lab
# new.snp.coord has to be in the Global Environment for map.graph to work
assign("new.snp.coord", paste("X",new.snp.data[,1],"_",new.snp.data[,2], sep = ""), envir=.GlobalEnv)
cat("\nLoad complete.")


#set your working directory to the files directory
files.dir= readline("Enter the path to the directory containing your .CEL files: ")
setwd(files.dir)
cel.files <- list.celfiles() # find files

# in case they gave the wrong directory, ask for a new one:
while(length(cel.files)==0){
	files.dir= readline("There are no .CEL files in that directory. Enter another directory: ")
	setwd(files.dir)
	cel.files <- list.celfiles()
	}
	
# find out what arrays to use:
cat("\n", "Available .CEL files:", "\n")
for (i in 1:length(cel.files)){cat(i,"-", cel.files[i],"\n")}
a1= as.numeric(readline("Enter the number shown above corresponding to the first array to be compared: "))
a2= as.numeric(readline("Enter the number shown above corresponding to the second array to be compared: "))

cat("Find the ecotype ID # for the outcross parent from this site: \nhttp://arabidopsis.usc.edu/Accession/#By%20Name\n\n")
cat("Here are a few common ID numbers for convenience:\n", 
"Bil-7      id 6901\nCAM-16     id   23\nCol-0      id 6909\nCS28181    id 6744\nCvi-0      id 6911\nEden-1     id 6009\nHod        id 8235\nKr-0       id 7201   (aka CS28419)\nLer-1      id 6932\nVan-0      id 6977\n", sep="")


# find the column in new.snp.data that corresponds to the outcross parent.
eco.id.num = readline("Enter the ecotype ID # for the outcross parent: ")
p2=which(colnames(new.snp.data) == paste("X", eco.id.num, sep=""))

# in case they gave the wrong ecotype ID number
while(length(p2)==0){
	eco.id.num= readline("That ecotype ID number can't be found in our polymorphism data.\nTry an alternate ID #:")
	p2=which(colnames(new.snp.data) == paste("X", eco.id.num, sep=""))}


#get the cM/Mb decision
cM.decider= readline("How should the difference in allele frequency be plotted?
Enter 1 for vs. centimorgans, Enter 2 for vs. mega base pairs: ")
if (as.numeric(cM.decider)!=1){plotcMtag=F}else{plotcMtag=T} #have the default be centimorgans



#get the file.stem
file.stem= readline("Enter the name to be given to the output files: ")


cat("Please wait while analysis is performed.")#be polite

#read/correct .CEL files
library(affy)
ord <- rep( c(1,2,3,4), nrow(snp)/4 ) # set up the matrix for readcel
snp <- cbind( snp, ord)
array.size <- 1612
probe.ok <- matrix(FALSE, nr=array.size, nc=array.size)
probe.ok.chrom <- matrix(NA, nr=array.size, nc=array.size)
probe.ok.position <- matrix(NA, nr=array.size, nc=array.size)
probe.ok.ord <- matrix(NA, nr=array.size, nc=array.size)
probe.ok[cbind(snp$xpos+1, snp$ypos+1)] <- TRUE
probe.ok.chrom[cbind(snp$xpos + 1, snp$ypos+1)] <- as.numeric( as.character(snp$chr) )
probe.ok.position [cbind(snp$xpos + 1, snp$ypos+1)] <- as.numeric( as.character( snp$bp) )
probe.ok.ord [cbind(snp$xpos + 1, snp$ypos+1)] <- as.numeric( as.character(snp$ord) )
probe.ok.chrom.valid <- probe.ok.chrom[probe.ok]
probe.ok.position.valid <- probe.ok.position [probe.ok]
probe.ok.ord.valid <- probe.ok.ord [probe.ok]
chrom.order <- order(probe.ok.chrom.valid, probe.ok.position.valid, probe.ok.ord.valid)
# only read in the files you want to analyze- note the indexing vector
mprobe.mean <- readcel(cel.files=cel.files, probe.number=nrow(snp), array.size=1612, filter.size=81)
colnames (mprobe.mean) <- sub(".CEL", "", cel.files)#name rows
rownames (mprobe.mean) <- as.character(snp$snpid)#name cols


#sort & subtract signals
#sense and antisense have to be in the Global Environment for map.graph to work.
assign("antisense", mprobe.mean[seq(1, nrow(snp), 4),]-mprobe.mean[seq(3, nrow(snp),4),], envir=.GlobalEnv)
assign("sense", mprobe.mean[seq(2, nrow(snp),4),]-mprobe.mean[seq(4, nrow(snp),4),], envir=.GlobalEnv)


p1 <-552 		# the column in "new.snp.data" that corresponds to the ref strain (i.e. Col-0)
map.graph(p1,p2,a1,a2,file.stem, plotcM=plotcMtag) #execute the function, note we choose arrays 1 and 2 b/c of use of the indexing vector above


#let them know you're done.
cat("\nAnalysis complete. Look in your .CEL files directory for results.\n\n")


	
	}#end function loop