poissonisfish

Pingback: Genome-wide association studies in R – Cloud Data Architect

Pingback: Genome-wide association studies in R – Mubashir Qasim

2017-10-10T08:11:50+00:00

Wow! Talk about timely! I have an honors student in my undergraduate biostatistics course who is just beginning to write an introductory “GAA for Dummies” tutorial for genome-wide association analysis. This article is a gold mine of information and references. Thanks.

	# snpStats supplies read.plink(); the .RData file is expected to provide the
	# `conversionTable` lookup used further down when SNPs are renamed.
	library(snpStats)
	load("conversionTable.RData")

	# Every population is stored as a PLINK .bed/.bim/.fam triplet sharing one stem.
	plinkExt <- c(".bed", ".bim", ".fam")

	# Malay
	pathM <- paste0("public/Genomics/108Malay_2527458snps", plinkExt)
	SNP_M <- read.plink(bed = pathM[1], bim = pathM[2], fam = pathM[3])

	# Indian
	pathI <- paste0("public/Genomics/105Indian_2527458snps", plinkExt)
	SNP_I <- read.plink(bed = pathI[1], bim = pathI[2], fam = pathI[3])

	# Chinese
	pathC <- paste0("public/Genomics/110Chinese_2527458snps", plinkExt)
	SNP_C <- read.plink(bed = pathC[1], bim = pathC[2], fam = pathC[3])

	# The three populations must carry the same number of markers (genotype
	# columns) before their samples can be stacked.
	nMarkers <- c(C = ncol(SNP_C$genotypes),
	              I = ncol(SNP_I$genotypes),
	              M = ncol(SNP_M$genotypes))
	if (nMarkers[["C"]] != nMarkers[["I"]] || nMarkers[["I"]] != nMarkers[["M"]]) {
	  stop("Different number of columns in input files detected. This is not allowed.")
	}

	# Merge the three SNP datasets, using the Malay object as the template
	SNP <- SNP_M
	# Marker annotation is the same for all three, so only its headers are renamed
	colnames(SNP$map) <- c("chr", "SNP", "gen.dist", "position", "A1", "A2")
	# Stack the samples (rows) in the order Malay, Indian, Chinese
	SNP$fam <- rbind(SNP_M$fam, SNP_I$fam, SNP_C$fam)
	SNP$genotypes <- rbind(SNP_M$genotypes, SNP_I$genotypes, SNP_C$genotypes)

	# Rename SNPs present in the conversion table into rs IDs.
	# `conversionTable` is used as a lookup: its names are the current SNP IDs and
	# its values are the replacement IDs.
	# One logical mask, built from the SNP column itself, drives both the lookup
	# and the assignment, so the two always agree in length and order (rather than
	# looking up by SNP$map$SNP while assigning by rownames(SNP$map)).
	mappedSNPs <- SNP$map$SNP %in% names(conversionTable)
	newIDs <- conversionTable[match(SNP$map$SNP[mappedSNPs], names(conversionTable))]
	SNP$map$SNP[mappedSNPs] <- newIDs

	# Load lipid datasets & match SNP-Lipidomics samples
	# (first column of each file becomes the row names, i.e. the sample IDs)
	lipidsMalay <- read.delim(file = "public/Lipidomic/117Malay_282lipids.txt", row.names = 1)
	lipidsIndian <- read.delim(file = "public/Lipidomic/120Indian_282lipids.txt", row.names = 1)
	lipidsChinese <- read.delim(file = "public/Lipidomic/122Chinese_282lipids.txt", row.names = 1)

	# Sanity check (printed, not enforced): the columns shared by all three tables
	# should be exactly the Malay columns, in the same order.
	sharedLipids <- Reduce(intersect, list(colnames(lipidsMalay),
	                                       colnames(lipidsIndian),
	                                       colnames(lipidsChinese)))
	all(sharedLipids == colnames(lipidsMalay)) # TRUE
	# Stack the samples in the order Malay, Indian, Chinese
	lip <- rbind(lipidsMalay, lipidsIndian, lipidsChinese)

	# Country
	# Number of genotyped samples per population, listed as Malay, Indian, Chinese
	country <- vapply(list(SNP_M, SNP_I, SNP_C),
	                  function(pop) nrow(pop$genotypes),
	                  integer(1))
	# One population label per row of the merged genotype matrix
	origin <- data.frame(sample.id = rownames(SNP$genotypes),
	                     Country = factor(rep(c("M", "I", "C"), times = country)))

	# Keep only samples present in both the lipid table and the genotype matrix,
	# and bring genotypes, lipids and origin into the same sample order.
	matchingSamples <- intersect(rownames(lip), rownames(SNP$genotypes))
	SNP$genotypes <- SNP$genotypes[matchingSamples, ]
	lip <- lip[matchingSamples, ]
	origin <- origin[match(matchingSamples, origin$sample.id), ]
	# Combine SNP and Lipidomics
	# The element is spelled out as `genotypes`: `SNP$genotype` only resolved
	# through partial matching of `$` on lists, which breaks silently if another
	# element with the same prefix is ever added.
	genData <- list(SNP = SNP$genotypes, MAP = SNP$map, LIP = lip)

	# Write processed omics (genData + origin) to an .RData file, and the matched
	# genotypes to PLINK files with stem "convertGDS" for the later GDS conversion
	save(list = c("genData", "origin"), file = "PhenoGenoMap.RData")
	write.plink(file.base = "convertGDS", snps = SNP$genotypes)

	# Clear memory: drops every object in the workspace; the next stage reloads
	# what it needs from PhenoGenoMap.RData
	rm(list = ls())

	# Packages and helpers for the QC / association stage.
	# The attach order is kept as is, since it determines which package masks which.
	library(snpStats)   # col.summary() / row.summary() on the genotype matrix
	library(doParallel) # not called directly below; presumably needed by GWASfunction.R (TODO confirm)
	library(SNPRelate)  # snpgds* functions: GDS conversion, LD pruning, IBD, PCA
	library(GenABEL)    # estlambda() for the QQ plot
	library(dplyr)      # not called directly below; presumably needed by GWASfunction.R (TODO confirm)
	# Presumably defines GWAA() and GWAS_Manhattan() used further down (verify in GWASfunction.R)
	source("GWASfunction.R")
	# Restores `genData` and `origin` saved by the pre-processing stage
	load("PhenoGenoMap.RData")

	# Use SNP call rate of 100%, MAF of 0.1 (very stringent)
	maf <- 0.1
	callRate <- 1
	SNPstats <- col.summary(genData$SNP)

	# Flag markers that pass both thresholds, then subset the per-SNP summary,
	# the map and the genotype columns with the same mask so they stay aligned
	maf_call <- SNPstats$MAF > maf & SNPstats$Call.rate == callRate
	SNPstats <- SNPstats[maf_call, ]
	genData$MAP <- genData$MAP[maf_call, ]
	genData$SNP <- genData$SNP[, maf_call]

	# Sample call rate & heterozygosity
	# callMat is TRUE where a genotype was called (samples in rows, SNPs in columns)
	callMat <- !is.na(genData$SNP)
	Sampstats <- row.summary(genData$SNP)
	# Hardy-Weinberg heterozygosity (expected): for each sample, sum 2 * p * (1 - p)
	# over the SNPs that were called for it. This needs the matrix product `%*%`
	# and an explicit `2 *`; the line previously read `callMat %% (2 SNPstats$MAF ...)`,
	# which does not parse.
	hetExp <- callMat %*% (2 * SNPstats$MAF * (1 - SNPstats$MAF))
	# Observed heterozygous count: heterozygosity rate times the number of called SNPs
	hetObs <- with(Sampstats, Heterozygosity * (ncol(genData$SNP)) * Call.rate)
	# Inbreeding coefficient estimate per sample
	Sampstats$hetF <- 1 - (hetObs / hetExp)
	# Use sample call rate of 100%, het threshold of 0.1 (very stringent)
	het <- 0.1 # Set cutoff for inbreeding coefficient;
	het_call <- with(Sampstats, abs(hetF) < het & Call.rate == 1)
	# Drop failing samples from genotypes and lipids alike
	genData$SNP <- genData$SNP[het_call, ]
	genData$LIP <- genData$LIP[het_call, ]

	# LD and kinship coeff
	ld <- .2
	kin <- .1
	# Convert the PLINK files written earlier into a GDS file that SNPRelate can open
	snpgdsBED2GDS(bed.fn = "convertGDS.bed", bim.fn = "convertGDS.bim",
	              fam.fn = "convertGDS.fam", out.gdsfn = "myGDS",
	              cvt.chr = "char")
	# Opened writable because the sample IDs are rewritten just below
	genofile <- snpgdsOpen("myGDS", readonly = FALSE)
	# Remove the first "-1" from each stored sample ID and write the IDs back,
	# presumably so they match rownames(genData$SNP) (TODO confirm)
	gds.ids <- sub("-1", "", read.gdsn(index.gdsn(genofile, "sample.id")))
	add.gdsn(genofile, "sample.id", gds.ids, replace = TRUE)
	geno.sample.ids <- rownames(genData$SNP)
	# First filter for LD
	snpSUB <- snpgdsLDpruning(genofile,
	                          ld.threshold = ld,
	                          sample.id = geno.sample.ids,
	                          snp.id = colnames(genData$SNP))
	snpset.ibd <- unlist(snpSUB, use.names = FALSE)
	# And now filter for MoM
	ibd <- snpgdsIBDMoM(genofile,
	                    kinship = TRUE,
	                    sample.id = geno.sample.ids,
	                    snp.id = snpset.ibd,
	                    num.thread = 1)
	# Keep only the sample pairs whose kinship reaches the cutoff
	ibdcoef <- snpgdsIBDSelection(ibd)
	ibdcoef <- ibdcoef[ibdcoef$kinship >= kin, ]

	# Filter samples out
	# Greedy pruning: while any pair is left in ibdcoef, drop the sample that
	# appears in the most remaining pairs.
	related.samples <- NULL
	while (nrow(ibdcoef) > 0) {
	  # count the number of occurrences of each and take the top one
	  pairCounts <- sort(table(c(ibdcoef$ID1, ibdcoef$ID2)), decreasing = TRUE)
	  rm.sample <- names(pairCounts)[1]
	  cat("Removing sample", rm.sample, "too closely related to",
	      pairCounts[1], "other samples.\n")

	  # remove from ibdcoef and add to list
	  involved <- ibdcoef$ID1 == rm.sample | ibdcoef$ID2 == rm.sample
	  ibdcoef <- ibdcoef[!involved, ]
	  related.samples <- c(as.character(rm.sample), related.samples)
	}
	# Drop the flagged samples from genotypes and lipids
	keepGeno <- !(rownames(genData$SNP) %in% related.samples)
	keepLip <- !(rownames(genData$LIP) %in% related.samples)
	genData$SNP <- genData$SNP[keepGeno, ]
	genData$LIP <- genData$LIP[keepLip, ]

	# PCA
	# NOTE(review): geno.sample.ids was captured before related samples were
	# removed from genData, so those samples still enter the PCA; confirm that
	# this is intended.
	set.seed(100)
	pca <- snpgdsPCA(genofile, sample.id = geno.sample.ids,
	                 snp.id = snpset.ibd, num.thread = 1)
	# Release the GDS handle (it was opened writable); nothing below uses genofile
	snpgdsClose(genofile)
	pctab <- data.frame(sample.id = pca$sample.id,
	                    PC1 = pca$eigenvect[, 1],
	                    PC2 = pca$eigenvect[, 2],
	                    stringsAsFactors = FALSE)

	# Subset and/or reorder origin accordingly
	origin <- origin[match(pca$sample.id, origin$sample.id), ]

	# One semi-transparent colour per population, shared by the points and the
	# legend. The legend previously used palette indices 1:3, which are not the
	# same colours as the rgb() values used for the points.
	popCol <- c(C = rgb(0, 0, 0, .3),  # black for chinese
	            I = rgb(1, 0, 0, .3),  # red for indian
	            M = rgb(0, .7, 0, .3)) # green for malay
	pcaCol <- rep(popCol[["C"]], length(pca$sample.id)) # default: chinese
	pcaCol[origin$Country == "I"] <- popCol[["I"]]
	pcaCol[origin$Country == "M"] <- popCol[["M"]]

	png("PCApopulation.png", width = 500, height = 500)
	plot(pctab$PC1, pctab$PC2, xlab = "PC1", ylab = "PC2", col = pcaCol, pch = 16)
	abline(h = 0, v = 0, lty = 2, col = "grey")
	legend("top", legend = c("Chinese", "Indian", "Malay"),
	       col = popCol[c("C", "I", "M")], pch = 16, bty = "n")
	dev.off()

	# Choose trait for association analysis, use colnames(genData$LIP) for listing
	# NOTE: Ignore the first column of genData$LIP (gender)
	target <- "Cholesterol"

	# Phenotype table: sample ID plus the centred and scaled trait values
	phenodata <- data.frame(id = rownames(genData$LIP),
	                        phenotype = scale(genData$LIP[, target]),
	                        stringsAsFactors = FALSE)

	# Conduct GWAS (will take a while); results go to "<target>.txt"
	start <- Sys.time()
	GWAA(genodata = genData$SNP, phenodata = phenodata, filename = paste0(target, ".txt"))
	Sys.time() - start # benchmark

	# Manhattan plot
	# Read the association results back in: one character column followed by
	# four numeric columns
	GWASout <- read.table(paste0(target, ".txt"),
	                      header = TRUE,
	                      colClasses = c("character", rep("numeric", 4)))
	GWASout$type <- rep("typed", times = nrow(GWASout))
	GWASout$Neg_logP <- -log10(GWASout$p.value)
	# Attach chromosome and position from the map (merged on the shared columns)
	mapCols <- genData$MAP[, c("SNP", "chr", "position")]
	GWASout <- merge(GWASout, mapCols)
	# Strongest associations first
	GWASout <- GWASout[order(GWASout$Neg_logP, decreasing = TRUE), ]

	png(paste0(target, ".png"), height = 500, width = 1000)
	GWAS_Manhattan(GWASout)
	dev.off()

	# QQ plot using GenABEL estlambda function
	# (squared t statistics are passed in; the estimate is kept in `lambda`)
	png(paste0(target, "_QQplot.png"), width = 500, height = 500)
	lambda <- estlambda(GWASout$t.value^2, plot = TRUE, method = "median")
	dev.off()

Historical background

Genome-wide association studies

Association mapping vs. linkage mapping

Let’s get started with R

Read data

Pre-processing

Analysis

Principal Component Analysis

Genome-Wide Association

Functional insights into candidate markers

Wrap-up

Citation

Share this:

51 thoughts on “Genome-wide association studies in R”

Leave a reply to Francisco de Abreu e Lima Cancel reply