#' Read TCGA breast cancer data nd write out raw count data and TPM expression data. #' Read the TCGA breast cancer expression data. (raw file is 1.5 GB) tpmDatMat <- read.delim("/media/user/My Passport/UofC_backup/bionimbus/data_scratch/finalData/dataIn/rnaSeq/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.Level_3.2015082100.0.0/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt", as.is=T) #' Pull out the raw read count data from the columns "raw_count" brcaRawCounts <- apply(tpmDatMat[-1,which(tpmDatMat[1,] == "raw_count")], 2, as.numeric) brcaRawCounts <- tpmDatMat[-1,which(tpmDatMat[1,] == "raw_count")] brcaRawCounts <- apply(brcaRawCounts, 2, as.numeric) #' Pull out the "scaled_estimates", these are scaled for depth (i.e. total reads in the sample) and for gene length in kilobases. brcaScaledEsts <- apply(tpmDatMat[-1,which(tpmDatMat[1,] == "scaled_estimate")], 2, as.numeric) brcaScaledEsts <- tpmDatMat[-1,which(tpmDatMat[1,] == "scaled_estimate")] brcaScaledEsts <- apply(brcaScaledEsts, 2, as.numeric) brcaLogTpm <- log((brcaScaledEsts*1000000)+1) # convert to log transformed TPM values. #' How do total read counts vary by sample? totalReadCountsPerRow <- apply(brcaRawCounts, 2, sum) #' Assign row names (genes) and column names (sample IDs) to this matrix of data geneNames <- do.call(cbind, strsplit(tpmDatMat[, "Hybridization.REF"], "|", fixed=TRUE))[1,][-1] rownames(brcaRawCounts) <- geneNames colnames(brcaRawCounts) <- substr(colnames(brcaRawCounts), 1, 28) rownames(brcaLogTpm) <- geneNames colnames(brcaLogTpm) <- substr(colnames(brcaRawCounts), 1, 28) #' Load the clinical data and pull out HER2, ER and PR status by immunohistochemistry clinicalDataLocation <- "/media/user/My Passport/UofC_backup/bionimbus/data_scratch/finalData/dataIn/clinical/nationwidechildrens.org_clinical_patient_brca.txt" clinDataBrca <- read.delim(clinicalDataLocation, as.is=T) her2status <- clinDataBrca[, "her2_status_by_ihc"] names(her2status) <- clinDataBrca[, "bcr_patient_barcode"] erStatus <- clinDataBrca[, "er_status_by_ihc"] names(erStatus) <- clinDataBrca[, "bcr_patient_barcode"] prStatus <- clinDataBrca[, "pr_status_by_ihc"] names(prStatus) <- clinDataBrca[, "bcr_patient_barcode"] #' Identify the tumor samples, tumor samples annotated as "01" by TCGA, normal samples as "10". We need to filter the normal samples. sampleNames <- colnames(brcaLogTpm) theTumorSamples <- which(substring(sampleNames, 14, 16) == "01A") brcaRawCounts_cancer <- brcaRawCounts[, theTumorSamples] brcaLogTpm_cancer <- brcaLogTpm[, theTumorSamples] #' We need to format the column names on our expression data, such that they are formatted the same way as the clinical data below. newNames <- gsub(".", "-", substring(colnames(brcaLogTpm_cancer), 1, 12), fixed=T) colnames(brcaRawCounts_cancer) <- newNames colnames(brcaLogTpm_cancer) <- newNames #' Identify HER2 postitive and negative samples, we will use these in our downstream analysis. sampsInBothDatasets <- clinDataBrca[, "bcr_patient_barcode"][clinDataBrca[, "bcr_patient_barcode"] %in% newNames] her2Neg <- names(which(her2status[sampsInBothDatasets] == "Negative")) her2Pos <- names(which(her2status[sampsInBothDatasets] == "Positive")) #' Create matrices of raw count data and log-tpm data for the first 15 HER2 positive and first 15 HER2 negative patients. Also write out their ER and PR status. brcaHer2sTpm <- cbind(brcaLogTpm_cancer[, her2Pos[1:15]], brcaLogTpm_cancer[, her2Neg[1:15]])[-(1:29),] # (also drop the first 30 rows with unknown gene names) brcaHer2sCounts <- cbind(brcaRawCounts_cancer[, her2Pos[1:15]], brcaRawCounts_cancer[, her2Neg[1:15]])[-(1:29),] # (also drop the first 30 rows with unknown gene names) erStatusThese <- erStatus[colnames(brcaHer2sTpm)] prStatusThese <- prStatus[colnames(brcaHer2sTpm)] her2StatusThese <- c(rep("positive", 15), rep("negative", 15)) #' Make a PCA plot. plot(prcomp(t(brcaHer2sTpm))$x, col=c(rep("red", 15), rep("blue", 15)), pch=prStatusThese) #' Save the TPM and raw read counts files at the end here. We'll load these and use them in the other script / in GSEA. save(brcaHer2sTpm, brcaHer2sCounts, erStatusThese, her2StatusThese, prStatusThese, file="/mnt/33f4cbbd-5a9d-47f3-826b-7c741caf0f8d/teachingData/brcaHer2.RData") #' Write out the data for GSEA # First, the .gct file with the data. gctFrame <- data.frame(NAME=rownames(brcaHer2sTpm), Description=rep("na", nrow(brcaHer2sTpm)), brcaHer2sTpm) theHeader <- paste("#1.2\n", nrow(brcaHer2sTpm), "\t", ncol(brcaHer2sTpm), sep="") # GSEA requires the .gct file with the expression data contain the following header. write.table_with_header <- function(x, file, header, ...) # a function to write a file with a header. Credit source: http://stackoverflow.com/questions/12381117/add-header-to-file-created-by-write-csv { cat(header, '\n', file = file) write.table(x, file, append = T, ...) } write.table_with_header(gctFrame, file="/mnt/33f4cbbd-5a9d-47f3-826b-7c741caf0f8d/teachingData/her2Expr.gct", theHeader, row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE) # Now the phenotype file, that denotes what's what clsFileHeader <- "30 2 1\n# HER2Pos HER2Neg" write.table_with_header(t(c(rep(0, 15), rep(1, 15))), file="/mnt/33f4cbbd-5a9d-47f3-826b-7c741caf0f8d/teachingData/her2Labels.cls", clsFileHeader, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)