# lab1_example.R
# CSE5599-BMI7830 Example 
# Goal - Differential expression of genes as manifest in microarrays 
# for patients afflicted with breast cancer. Specifically, the data 
# includes gene expression for 47  human breast tumor cases; 
# (* normalized by GCRMA for global expression analysis)
# Scholarly Manuscript - 
# Richardson AL, Wang ZC, De Nicolo A, Lu X et al. 
# X chromosomal abnormalities in basal-like human breast cancer. 
# Cancer Cell 2006 Feb;9(2):121-32. PMID: 16473279
# Script found and adapted by Prof. K. Huang.

source("http://bioconductor.org/biocLite.R")
biocLite("RColorBrewer")
biocLite("limma")
biocLite("gcrma")
biocLite("affy")
biocLite("Biobase")
biocLite("affyPLM")
biocLite("affydata")
biocLite("matchprobes")
biocLite("GEOquery")
biocLite("simpleaffy")

library(GEOquery)
getGEOSuppFiles("GSE3744")
untar("GSE3744/GSE3744_RAW.tar", exdir="data")
cels <- list.files("data/", pattern = "[gz]")
sapply(paste("data", cels, sep="/"), gunzip)
cels

library(simpleaffy)
celfiles <- read.affy(covdesc="GDS2250_SampleAnnotation.txt", path="data")
celfiles
celfiles.gcrma <- gcrma(celfiles)
celfiles.gcrma

# load colour libraries
library(RColorBrewer)
# set colour palette
cols <- brewer.pal(8, "Set1")
# plot a boxplot of unnormalised intensity values
boxplot(celfiles, col=cols)
# plot a boxplot of normalised intensity values, affyPLM is required to interrogate celfiles.gcrma
library(affyPLM)
boxplot(celfiles.gcrma, col=cols)
# the boxplots are somewhat skewed by the normalisation algorithm
# and it is often more informative to look at density plots
# Plot a density vs log intensity histogram for the unnormalised data
hist(celfiles, col=cols)
# Plot a density vs log intensity histogram for the normalised data
hist(celfiles.gcrma, col=cols)

# Perform probe-level metric calculations on the CEL files:
celfiles.qc <- fitPLM(celfiles)
# Create an image of the first .CEL:
image(celfiles.qc, which=1, add.legend=TRUE)
# Create an image of the 4th.CEL
# There is a spatial artifact present
image(celfiles.qc, which=4, add.legend=TRUE)
# affyPLM also provides more informative boxplots
# RLE (Relative Log Expression) plots should have
# values close to zero. GSM524665.CEL is an outlier
RLE(celfiles.qc, main="RLE")
# We can also use NUSE (Normalised Unscaled Standard Errors).
# The median standard error should be 1 for most genes.
# GSM524665.CEL appears to be an outlier on this plot too
NUSE(celfiles.qc, main="NUSE")

celfiles.filtered <- nsFilter(celfiles.gcrma, require.entrez=FALSE, remove.dupEntrez=FALSE)
# What got removed and why?
celfiles.filtered$filter.log

samples <- celfiles.gcrma$Factors
# check the results of this
samples
# convert into factors
samples <- as.factor(samples)
# check factors have been assigned
samples
# set up the experimental design
design <- model.matrix(~0 + samples)
colnames(design) <- c("non_basal_like_cancer", "BRCA1_associated_cancer", "basal_like_cancer", "normal")
# inspect the experiment design
design

library(limma)
# fit the linear model to the filtered expression set
fit <- lmFit(exprs(celfiles.filtered$eset), design)
# set up a contrast matrix to compare tissues v cell line
contrast.matrix <- makeContrasts(basal_nonbasal = basal_like_cancer - non_basal_like_cancer, basal_normal = basal_like_cancer - normal, levels=design)
# check the contrast matrix
contrast.matrix
# Now the contrast matrix is combined with the per-probeset linear model fit.
basal_fits <- contrasts.fit(fit, contrast.matrix)
basal_ebFit <- eBayes(basal_fits)
# return the top 10 results for any given contrast
# coef=1 is basal vs nonbasal, coef=2 is basal vs normal
topTable(basal_ebFit, number=10, coef=1)
nrow(topTable(basal_ebFit, coef=1, number=10000, lfc=5))

topTable(basal_ebFit, number=10, coef=2)
nrow(topTable(basal_ebFit, coef=2, number=10000, lfc=5))

probeset.list <- topTable(basal_ebFit, coef=1, number=10000, lfc=4)

biocLite("hgu133plus2.db")
library(hgu133plus2.db)
library(annotate)
gene.symbols <- getSYMBOL(row.names(probeset.list), "hgu133plus2")
results <- cbind(probeset.list, gene.symbols)
head(results)

gene_list <- topTable(basal_ebFit, coef=1, number=1000000, sort.by="logFC")
gene.symbols <- getSYMBOL(row.names(gene_list), "hgu133plus2")
gene_list <- cbind(gene_list, gene.symbols)

head(gene_list$logFC)
head(gene_list$P.Value)

##The par command sets "nice" graphical defaults
par(mar=c(3,3,2,1), mgp=c(2,.7,0), tck=-.01)
plot(gene_list$logFC, -log10(gene_list$P.Value),
     xlim=c(-10, 10), ylim=c(0, 15), #Set limits
     xlab="log2 fold change", ylab="-log10 p-value")#Set axis labels


sum(abs(gene_list$logFC) > 2 & gene_list$P.Value < 0.001)

##no_of_genes=27,306
no_of_genes = dim(probeset.list)[1]
##Gives 693 genes
sum(abs(gene_list$logFC) > 2 & gene_list$P.Value < 0.05/no_of_genes)

install.packages("ggplot2")
require(ggplot2)
##Highlight genes that have an absolute fold change > 2 and a p-value < Bonferroni cut-off
gene_list$threshold = as.factor(abs(gene_list$logFC) > 2 & gene_list$P.Value < 0.05/no_of_genes)

##Construct the plot object
g = ggplot(data=gene_list, aes(x=lproogFC, y=-log10(P.Value), colour=threshold)) + 
  geom_point(alpha=0.4, size=1.75) + theme(legend.position = "none") + xlim(c(-10, 10)) + ylim(c(0, 15)) + xlab("log2 fold change") + ylab("-log10 p-value")
g

ind <- which(gene_list$gene.symbols == 'ESR1')
##Graph not shown
##g + geom_text(aes(x=gene_list[ind[1],]$logFC, y=-log10(gene_list[ind[1],]$P.Value), label=gene_list[ind[1],]$gene.symbols, size=1.2), colour="black")
g + geom_text(aes(x=gene_list[ind,]$logFC, y=-log10(gene_list[ind,]$P.Value), label=gene_list[ind,]$gene.symbols, size=1.2), colour="black")