## source file for "Chromatin-enriched RNAs mark active and repressive cis-regulation: an analysis of nuclear RNA-seq"
## Author: Xiangying Sun
## Last update: Jun 2019


###########################################################################################
# 1. Genome Alignment using Tophat
###########################################################################################

#######	K562_CPE1 
tophat --rg-sample SRR3703288 --rg-id SRR3703288 --library-type=fr-firststrand \
--segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \
-o /homeDir/GSE83531_cheRNA/tophat/SRR3703288 \
/homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \
/homeDir/GSE83531_cheRNA/FastQ/SRR3703288.fastq.gz

#######	K562_CPE2
tophat --rg-sample SRR3703289 --rg-id SRR3703289 --library-type=fr-firststrand \
--segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \
-o /homeDir/GSE83531_cheRNA/tophat/SRR3703289 \
/homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \
/homeDir/GSE83531_cheRNA/FastQ/SRR3703289.fastq.gz

#######	K562_CPE3
tophat --rg-sample SRR3703290 --rg-id SRR3703290 --library-type=fr-firststrand \
--segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \
-o /homeDir/GSE83531_cheRNA/tophat/SRR3703290 \
/homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \
/homeDir/GSE83531_cheRNA/FastQ/SRR3703290.fastq.gz

#######	K562_SNE1
tophat --rg-sample SRR3703291 --rg-id SRR3703291 --library-type=fr-firststrand \
--segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \
-o /homeDir/GSE83531_cheRNA/tophat/SRR3703291 \
/homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \
/homeDir/GSE83531_cheRNA/FastQ/SRR3703291.fastq.gz

#######	K562_SNE2
tophat --rg-sample SRR3703292 --rg-id SRR3703292 --library-type=fr-firststrand \
--segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \
-o /homeDir/GSE83531_cheRNA/tophat/SRR3703292 \
/homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \
/homeDir/GSE83531_cheRNA/FastQ/SRR3703292.fastq.gz

#######	K562_SNE3
tophat --rg-sample SRR3703293 --rg-id SRR3703293 --library-type=fr-firststrand \
--segment-length 50 --segment-mismatches 2 --no-coverage-search --keep-fasta-order -p 32 \
-o /homeDir/GSE83531_cheRNA/tophat/SRR3703293 \
/homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome \
/homeDir/GSE83531_cheRNA/FastQ/SRR3703293.fastq.gz


###########################################################################################
# 2. clean bam files using samtools to remove reads mapped mitochondria chromosome
###########################################################################################

#######	K562_CPE1 
samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703288_accepted_hits.bam \
| cut -f 1 | grep 'chr' | grep -v 'chrM' | \
xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703288_accepted_hits.bam > \
/homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703288_clean.bam
samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703288_clean.bam

#######	K562_CPE2
samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703289_accepted_hits.bam \
| cut -f 1 | grep 'chr' | grep -v 'chrM' | \
xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703289_accepted_hits.bam > \
/homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703289_clean.bam
samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703289_clean.bam

#######	K562_CPE3
samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703290_accepted_hits.bam \
| cut -f 1 | grep 'chr' | grep -v 'chrM' | \
xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703290_accepted_hits.bam > \
/homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703290_clean.bam
samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703290_clean.bam

#######	K562_SNE1
samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703291_accepted_hits.bam \
| cut -f 1 | grep 'chr' | grep -v 'chrM' | \
xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703291_accepted_hits.bam > \
/homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703291_clean.bam
samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703291_clean.bam

#######	K562_SNE2
samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703292_accepted_hits.bam \
| cut -f 1 | grep 'chr' | grep -v 'chrM' | \
xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703292_accepted_hits.bam > \
/homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703292_clean.bam
samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703292_clean.bam

#######	K562_SNE3
samtools idxstats /homeDir/GSE83531_cheRNA/All_bam/SRR3703293_accepted_hits.bam \
| cut -f 1 | grep 'chr' | grep -v 'chrM' | \
xargs samtools view -b /homeDir/GSE83531_cheRNA/All_bam/SRR3703293_accepted_hits.bam > \
/homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703293_clean.bam
samtools index /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703293_clean.bam


###########################################################################################
# 3. RABT assembly using Cufflinks
###########################################################################################

#######	K562_CPE1 
cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703288 \
-p 32 --library-type fr-firststrand \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703288_clean.bam

#######	K562_CPE2
cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703289 \
-p 32 --library-type fr-firststrand \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703280_clean.bam

#######	K562_CPE3
cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703290 \
-p 32 --library-type fr-firststrand \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703290_clean.bam

#######	K562_SNE1
cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703291 \
-p 32 --library-type fr-firststrand \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703291_clean.bam

#######	K562_SNE2
cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703292 \
-p 32 --library-type fr-firststrand \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703292_clean.bam

#######	K562_SNE3
cufflinks -o /homeDir/GSE83531_cheRNA/cufflinks_rabt/SRR3703293 \
-p 32 --library-type fr-firststrand \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-u /homeDir/GSE83531_cheRNA/cleaned_bam/SRR3703293_clean.bam


###########################################################################################
# 4. Build transcriptome using Cuffmerge
###########################################################################################

cuffmerge -o /homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/ -p 32 \
-g /homeDir/Reference_Sequences/Human/Annotation/gencode.v25.primary_assembly.annotation.gtf \
-s /homeDir/Reference_Sequences/Human/FASTA/GRCh38.primary_assembly.genome.fa \
/homeDir/GSE83531_cheRNA/cufflinks_rabt/che_sne_K562.txt


##########################################################################
# 5. Get geneCounts using htseq
##########################################################################

#######	K562_CPE1 
htseq-count -f bam -s reverse -m intersection-nonempty \
/homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703288_clean.bam \
/homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \
/homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703288_geneCounts.out

#######	K562_CPE2
htseq-count -f bam -s reverse -m intersection-nonempty \
/homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703289_clean.bam \
/homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \
/homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703289_geneCounts.out

#######	K562_CPE3
htseq-count -f bam -s reverse -m intersection-nonempty \
/homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703290_clean.bam \
/homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \
/homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703290_geneCounts.out

#######	K562_SNE1
htseq-count -f bam -s reverse -m intersection-nonempty \
/homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703291_clean.bam \
/homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \
/homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703291_geneCounts.out

#######	K562_SNE2
htseq-count -f bam -s reverse -m intersection-nonempty \
/homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703292_clean.bam \
/homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \
/homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703292_geneCounts.out

#######	K562_SNE3
htseq-count -f bam -s reverse -m intersection-nonempty \
/homeDir/GSE83531_cheRNA/cleaned_bam/K562_cleaned_bam/SRR3703293_clean.bam \
/homeDir/GSE83531_cheRNA/cuffmerge_rabt_k562/merged.gtf > \
/homeDir/GSE83531_cheRNA/HTseq_ruthenburg_1000_K562/SRR3703293_geneCounts.out


##########################################################################
# 6. Differential expression analysis using limma package in R
##########################################################################
###### Reference: https://www.bioconductor.org/help/workflows/RNAseq123/

library(limma)
library(edgeR)

###### read htseq geneCounts into a DGE matrix used in edgeR
htseqToDGE <- function(files_dir){
  library(edgeR)
  files <- list.files(path=files_dir, pattern="geneCounts")
  files <- sort(files, decreasing=T)
  files_fullPath <- sapply(files, function(x) paste0(files_dir,x))
  #########
  raw_DGE <- readDGE(files_fullPath, header=F)
  list <- c("no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique")
  for(i in list){
    tmp <- grep(i, rownames(raw_DGE$counts))
    if(length(tmp)>0) raw_DGE$counts <- raw_DGE$counts[-tmp,]
  }
  colnames(raw_DGE) <- sapply(files, function(x) unlist(strsplit(x, "_", fixed=T))[1])
  group <- as.factor(c("SNE", "SNE", "SNE", "CPE", "CPE", "CPE"))
  raw_DGE$samples$group <- group
  return(raw_DGE)
}

###### filtered raw counts, normalize raw counts and make density plots and barplots
filterNormPlot <- function(raw_DGE, method, cpm_cutoff){
  library(edgeR)
  library(RColorBrewer)
  
  # plot log-cpm of raw data
  par(mfrow=c(2,2))
  nsamples <- ncol(raw_DGE)
  col <- brewer.pal(nsamples, "RdYlGn")
  samplenames <- colnames(tuxedo_DGE)
  lcpm <- cpm(raw_DGE, log=TRUE)
  plot(density(lcpm[,1]), col=col[1], lwd=2, las=2, ylim=c(0,0.5), 
       main="", xlab="")
  title(main=paste0("A. ",method," Raw data"), xlab="Log2-cpm")
  abline(v=0, lty=3)
  for (i in 2:nsamples){
    den <- density(lcpm[,i])
    lines(den$x, den$y, col=col[i], lwd=2)
  }
  legend("topright", samplenames, text.col=col, bty="n")
  
  # filter by cpm
  cpm <- cpm(raw_DGE)
  keep.exprs <- rowSums(cpm>cpm_cutoff)>=3  
  filtered_DGE <- raw_DGE[keep.exprs, keep.lib.sizes=FALSE]
  dim(filtered_DGE)
  filtered_lcpm <- cpm(filtered_DGE, log=TRUE)
  plot(density(filtered_lcpm[,1]), col=col[1], lwd=2, las=2, ylim=c(0,0.5),
       main="", xlab="")
  title(main=paste0("B. ", method, " Filtered data"), xlab="Log2-cpm")
  abline(v=0, lty=3)
  for (i in 2:nsamples){
    den <- density(filtered_lcpm[,i])
    lines(den$x, den$y, col=col[i], lwd=2)
  }
  legend("topright", samplenames, text.col=col, bty="n")
  
  # plot unnormalized data
  boxplot(filtered_lcpm, las=2, col=col, main="")
  title(main=paste0("C. Example: ",method," Unnormalized filtered data"),ylab="Log2-cpm")
  
  # normalization
  norm_DGE <- calcNormFactors(filtered_DGE, method = "TMM")
  norm_DGE$samples$norm.factors
  norm_lcpm <- cpm(norm_DGE, log=TRUE)
  boxplot(norm_lcpm, las=2, col=col, main="")
  title(main=paste0("D. Example: ",method," Normalised filtered data"),ylab="Log2-cpm")
  return(norm_DGE)
}

limmaFitDE <- function(norm_DGE, method){
  library(limma)
  group <- norm_DGE$samples$group
  design <- model.matrix(~0+group)
  contr.matrix <- makeContrasts(CPEvsSNE = groupCPE-groupSNE, levels = colnames(design))
  
  par(mfrow=c(1,2))
  v <- voom(norm_DGE, design, plot=TRUE)
  vfit <- lmFit(v, design)
  vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
  efit <- eBayes(vfit)
  plotSA(efit, main=paste0(method, " Final model: Mean???variance trend"))
  return(efit)
}


tuxedo_path <- "HTseq_tuxedo/"
tuxedo_DGE <- htseqToDGE(tuxedo_path)

pdf(file="limma_filter_norm_plot_2.pdf", width=10, height=8)
tuxedo_normDGE <- filterNormPlot(tuxedo_DGE, "Tuxedo", 1)
dev.off()

tuxedo_efit <- limmaFitDE(tuxedo_normDGE, "Tuxedo")

summary(decideTests(tuxedo_efit, p.value = 0.05, lfc=log2(1.2)))

tuxedo_limmaRES <- topTreat(tuxedo_efit, sort="none",number=Inf)
tuxedo_limmaRES$decision <- decideTests(tuxedo_efit, p.value = 0.05, lfc=log2(1.2))



