#Assembled by R.Machiraju # From Text Book by Sorin Dragichi # Chapter 7 # We will walk thorugh the processing of microarray data as described in the paper # by Golub et al. # # Ready to roll #Set it up so that you can download stuff source("http://www.bioconductor.org/biocLite.R") biocLite("golubEsets") # attach golubESets package require(golubEsets) # bring Object Golub_Merge into scope data(Golub_Merge) # just mention and display what is in the object Golub_Merge #experiment-level metaData experimentData(Golub_Merge) #Expression set and objectgets #Extract abstract - first 1000 characters substr(abstract(Golub_Merge), 1,1000) # numerical and categorical data about the microarray data and samples to which # it was applied dim(exprs(Golub_Merge)) dim(pData(Golub_Merge)) # how are assay reporters named ? featureNames(Golub_Merge)[1001:1010] # how are samples named ? sampleNames(Golub_Merge)[1:10] #help("ExpressionSet-class"); # Functions and memberships are facilitated by special containers called #ExpressionSet getClass("ExpressionSet") # Primary Outcome of study table(Golub_Merge$ALL.AML) #Annotation annotation(Golub_Merge) # Use hu6800 representation biocLite("hu6800.db") library("hu6800.db") #constituents objects("package:hu6800.db") ls(hu6800SYMBOL)[1:10] #keys of HUGO mapping ls(hu6800SYMBOL)[1:1] #retrieve actual mappings mget(ls(hu6800SYMBOL)[1:1],hu6800SYMBOL) #SQL (SQ lite) for annotation packaging - connecting probe identifiers to tokens such as HUGO, #chromosomal location and gene coordinates, gene ontology (GO) term tags, #KEGG pathway tags, OMIM concepts, PubMed reference tags # SQLite is used instead of hastables hh=org.Hs.eg_dbconn() #List tables for NCBI Entrez Gene metadata dbListTables(hh) # # Find probeset identifiers for genes annotated as sarcoma oncogenes. #Use partial mathcing query # SQL = paste( "select * from gene_info inner join genes using (_id)", " where gene_name like '%sarcoma%oncogene%'") sops = dbGetQuery(hh,SQL) #how many ? dim(sops) # What are they ? sops$symbol #Gene LYN -get it sops[13,] get("4067",revmap(hu6800ENTREZID)) # Visualize experssion of LYN in both ALL and AML boxplot(exprs(Golub_Merge)["M16038_at",]~Golub_Merge$ALL.AML,col=c("darkgreen","yellow")) #Gene YES1 sops[19,] get("6194",revmap(hu6800ENTREZID)) boxplot(exprs(Golub_Merge)["M77232_rna1_at",]~Golub_Merge$ALL.AML,col=c("darkgreen","yellow")) #PREDICTIVE MODELING #Sort of works source("http://www.bioconductor.org/biocLite.R") biocLite("MLInterfaces") library("MLInterfaces") #https://www.bioconductor.org/packages/release/bioc/manuals/genefilter/man/genefilter.pdf # Make sure #install.packages("randomForest") #install.packages("genefilter") # Filter based on variance. Keep the highest variance ones. Top 10%. GMF <- nsFilter(Golub_Merge,var.cutoff=.9)[[1]] #Different from book. We are using cross-validation. Also ask that importance measures # be computed. # More details in Chapter 29. rf1 <- MLearn(ALL.AML~., data=GMF, randomForestI, 1:40, importance=TRUE, sampsize=table(GMF$ALL.AML[1:40]), mtry=sqrt(ncol(exprs(GMF)))) rf1 # Notice the confusion matrix RObject(rf1) # Importance of genes for prediction savpar = par(no.readonly=TRUE) par(las=2,mar=c(6,9,6,6)) plot(getVarImp(rf1,FALSE),plat="hu6800",toktype="SYMBOL",col="gold") par(savpar) #end