# lab1_example.R # CSE5599-BMI7830 Example # Goal - Differential expression of genes as manifest in microarrays # for patients afflicted with breast cancer. Specifically, the data # includes gene expression for 47 human breast tumor cases; # (* normalized by GCRMA for global expression analysis) # Scholarly Manuscript - # Richardson AL, Wang ZC, De Nicolo A, Lu X et al. # X chromosomal abnormalities in basal-like human breast cancer. # Cancer Cell 2006 Feb;9(2):121-32. PMID: 16473279 # Script found and adapted by Prof. K. Huang. source("http://bioconductor.org/biocLite.R") biocLite("RColorBrewer") biocLite("limma") biocLite("gcrma") biocLite("affy") biocLite("Biobase") biocLite("affyPLM") biocLite("affydata") biocLite("matchprobes") biocLite("GEOquery") biocLite("simpleaffy") library(GEOquery) getGEOSuppFiles("GSE3744") untar("GSE3744/GSE3744_RAW.tar", exdir="data") cels <- list.files("data/", pattern = "[gz]") sapply(paste("data", cels, sep="/"), gunzip) cels library(simpleaffy) celfiles <- read.affy(covdesc="GDS2250_SampleAnnotation.txt", path="data") celfiles celfiles.gcrma <- gcrma(celfiles) celfiles.gcrma # load colour libraries library(RColorBrewer) # set colour palette cols <- brewer.pal(8, "Set1") # plot a boxplot of unnormalised intensity values boxplot(celfiles, col=cols) # plot a boxplot of normalised intensity values, affyPLM is required to interrogate celfiles.gcrma library(affyPLM) boxplot(celfiles.gcrma, col=cols) # the boxplots are somewhat skewed by the normalisation algorithm # and it is often more informative to look at density plots # Plot a density vs log intensity histogram for the unnormalised data hist(celfiles, col=cols) # Plot a density vs log intensity histogram for the normalised data hist(celfiles.gcrma, col=cols) # Perform probe-level metric calculations on the CEL files: celfiles.qc <- fitPLM(celfiles) # Create an image of the first .CEL: image(celfiles.qc, which=1, add.legend=TRUE) # Create an image of the 4th.CEL # There is a spatial artifact present image(celfiles.qc, which=4, add.legend=TRUE) # affyPLM also provides more informative boxplots # RLE (Relative Log Expression) plots should have # values close to zero. GSM524665.CEL is an outlier RLE(celfiles.qc, main="RLE") # We can also use NUSE (Normalised Unscaled Standard Errors). # The median standard error should be 1 for most genes. # GSM524665.CEL appears to be an outlier on this plot too NUSE(celfiles.qc, main="NUSE") celfiles.filtered <- nsFilter(celfiles.gcrma, require.entrez=FALSE, remove.dupEntrez=FALSE) # What got removed and why? celfiles.filtered$filter.log samples <- celfiles.gcrma$Factors # check the results of this samples # convert into factors samples <- as.factor(samples) # check factors have been assigned samples # set up the experimental design design <- model.matrix(~0 + samples) colnames(design) <- c("non_basal_like_cancer", "BRCA1_associated_cancer", "basal_like_cancer", "normal") # inspect the experiment design design library(limma) # fit the linear model to the filtered expression set fit <- lmFit(exprs(celfiles.filtered$eset), design) # set up a contrast matrix to compare tissues v cell line contrast.matrix <- makeContrasts(basal_nonbasal = basal_like_cancer - non_basal_like_cancer, basal_normal = basal_like_cancer - normal, levels=design) # check the contrast matrix contrast.matrix # Now the contrast matrix is combined with the per-probeset linear model fit. basal_fits <- contrasts.fit(fit, contrast.matrix) basal_ebFit <- eBayes(basal_fits) # return the top 10 results for any given contrast # coef=1 is basal vs nonbasal, coef=2 is basal vs normal topTable(basal_ebFit, number=10, coef=1) nrow(topTable(basal_ebFit, coef=1, number=10000, lfc=5)) topTable(basal_ebFit, number=10, coef=2) nrow(topTable(basal_ebFit, coef=2, number=10000, lfc=5)) probeset.list <- topTable(basal_ebFit, coef=1, number=10000, lfc=4) biocLite("hgu133plus2.db") library(hgu133plus2.db) library(annotate) gene.symbols <- getSYMBOL(row.names(probeset.list), "hgu133plus2") results <- cbind(probeset.list, gene.symbols) head(results) gene_list <- topTable(basal_ebFit, coef=1, number=1000000, sort.by="logFC") gene.symbols <- getSYMBOL(row.names(gene_list), "hgu133plus2") gene_list <- cbind(gene_list, gene.symbols) head(gene_list$logFC) head(gene_list$P.Value) ##The par command sets "nice" graphical defaults par(mar=c(3,3,2,1), mgp=c(2,.7,0), tck=-.01) plot(gene_list$logFC, -log10(gene_list$P.Value), xlim=c(-10, 10), ylim=c(0, 15), #Set limits xlab="log2 fold change", ylab="-log10 p-value")#Set axis labels sum(abs(gene_list$logFC) > 2 & gene_list$P.Value < 0.001) ##no_of_genes=27,306 no_of_genes = dim(probeset.list)[1] ##Gives 693 genes sum(abs(gene_list$logFC) > 2 & gene_list$P.Value < 0.05/no_of_genes) install.packages("ggplot2") require(ggplot2) ##Highlight genes that have an absolute fold change > 2 and a p-value < Bonferroni cut-off gene_list$threshold = as.factor(abs(gene_list$logFC) > 2 & gene_list$P.Value < 0.05/no_of_genes) ##Construct the plot object g = ggplot(data=gene_list, aes(x=lproogFC, y=-log10(P.Value), colour=threshold)) + geom_point(alpha=0.4, size=1.75) + theme(legend.position = "none") + xlim(c(-10, 10)) + ylim(c(0, 15)) + xlab("log2 fold change") + ylab("-log10 p-value") g ind <- which(gene_list$gene.symbols == 'ESR1') ##Graph not shown ##g + geom_text(aes(x=gene_list[ind[1],]$logFC, y=-log10(gene_list[ind[1],]$P.Value), label=gene_list[ind[1],]$gene.symbols, size=1.2), colour="black") g + geom_text(aes(x=gene_list[ind,]$logFC, y=-log10(gene_list[ind,]$P.Value), label=gene_list[ind,]$gene.symbols, size=1.2), colour="black")