#Assembled by R.Machiraju
# From Text Book by Sorin Dragichi
# Chapter 7
# We will walk thorugh the processing of microarray data as described in the paper
# by Golub et al.
#

# Ready to roll 
#Set it up so that you can download stuff

source("http://www.bioconductor.org/biocLite.R")
biocLite("golubEsets")

# attach golubESets package
require(golubEsets)

# bring Object Golub_Merge into scope
data(Golub_Merge)

# just mention and display what is in the object
Golub_Merge

#experiment-level metaData
experimentData(Golub_Merge)

#Expression set and objectgets

#Extract abstract - first 1000 characters
substr(abstract(Golub_Merge), 1,1000)

# numerical and categorical data about the microarray data and samples to which
# it was applied
dim(exprs(Golub_Merge))

dim(pData(Golub_Merge))


# how are assay reporters named ?
featureNames(Golub_Merge)[1001:1010]

# how are samples named ?
sampleNames(Golub_Merge)[1:10]
#help("ExpressionSet-class");

# Functions and memberships are facilitated by special containers called 
#ExpressionSet

getClass("ExpressionSet")
# Primary Outcome of study
table(Golub_Merge$ALL.AML)

#Annotation
annotation(Golub_Merge)

# Use hu6800 representation
biocLite("hu6800.db")
library("hu6800.db")

#constituents

objects("package:hu6800.db")
ls(hu6800SYMBOL)[1:10]

#keys of HUGO mapping

ls(hu6800SYMBOL)[1:1]

#retrieve actual mappings

mget(ls(hu6800SYMBOL)[1:1],hu6800SYMBOL)

#SQL (SQ lite) for annotation packaging - connecting probe identifiers to tokens such as HUGO, 
#chromosomal location and gene coordinates, gene ontology (GO) term tags, 
#KEGG pathway tags, OMIM concepts, PubMed reference tags
# SQLite is used instead of hastables


hh=org.Hs.eg_dbconn()
#List tables for NCBI Entrez Gene metadata

dbListTables(hh)

#
# Find probeset identifiers for genes annotated as sarcoma oncogenes. 
#Use partial mathcing query
#
SQL = paste( "select * from gene_info inner join genes using (_id)", " where gene_name like '%sarcoma%oncogene%'")
sops = dbGetQuery(hh,SQL)

#how many ?
dim(sops)
# What are they ?
sops$symbol

#Gene LYN -get it
sops[13,]
get("4067",revmap(hu6800ENTREZID))

# Visualize experssion of LYN in both ALL and AML

boxplot(exprs(Golub_Merge)["M16038_at",]~Golub_Merge$ALL.AML,col=c("darkgreen","yellow"))

#Gene YES1
sops[19,]

get("6194",revmap(hu6800ENTREZID))
boxplot(exprs(Golub_Merge)["M77232_rna1_at",]~Golub_Merge$ALL.AML,col=c("darkgreen","yellow"))

#PREDICTIVE MODELING
#Sort of works

source("http://www.bioconductor.org/biocLite.R")
biocLite("MLInterfaces")
library("MLInterfaces")

#https://www.bioconductor.org/packages/release/bioc/manuals/genefilter/man/genefilter.pdf
# Make sure
#install.packages("randomForest")
#install.packages("genefilter")

# Filter based on variance. Keep the highest variance ones. Top 10%.

GMF <- nsFilter(Golub_Merge,var.cutoff=.9)[[1]]

#Different from book. We are using cross-validation. Also ask that importance measures
# be computed.
# More details in Chapter 29.

rf1 <- MLearn(ALL.AML~., data=GMF, randomForestI, 1:40, importance=TRUE, sampsize=table(GMF$ALL.AML[1:40]), mtry=sqrt(ncol(exprs(GMF))))
rf1

# Notice the confusion matrix

RObject(rf1)

# Importance of genes for prediction
savpar = par(no.readonly=TRUE)
par(las=2,mar=c(6,9,6,6))
plot(getVarImp(rf1,FALSE),plat="hu6800",toktype="SYMBOL",col="gold")
par(savpar)

#end