GenABEL Exercise

From Center for Statistical Genetics

Jump to: navigation, search

GenABEL Exercise

R:

 # Load files
 library(GenABEL)
 # Convert the tped/tfam genotype files into GenABEL's raw format, then load
 # the genotypes together with the phenotype table.
 convert.snp.tped(
   tped = "gwa_gabel_qtl.tped",
   tfam = "gwa_gabel_qtl.tfam",
   out = "gwa_gabel_qtl.raw",
   strand = "u"
 )
 # TRUE is spelled out: T is an ordinary variable that can be reassigned.
 g.dat <- load.gwaa.data(
   phen = "gwa_gabel_qtl.praw",
   gen = "gwa_gabel_qtl.raw",
   force = TRUE
 )
 # Inspect the structure of the loaded object
 slotNames(g.dat)
 slotNames(g.dat@gtdata)
 colnames(g.dat@phdata)
 # sample size
 sample.size <- g.dat@gtdata@nids
 # number of SNPs
 snps.total <- g.dat@gtdata@nsnps
 print(c(sample.size, snps.total))
 # Trait: numeric summary, density-scaled histogram and a rug of raw values
 summary(g.dat@phdata$disease)
 hist(
   g.dat@phdata$disease,
   main = "Quantitative Phenotype data summary",
   xlab = "Systolic pressure measure",
   freq = FALSE,
   breaks = 20,
   col = "gray"
 )
 rug(g.dat@phdata$disease)
 ###
 # tests for association
 ###
 # Genome-wide significance threshold used by every selection below
 alpha <- 5e-8
 # GLM test: one regression per SNP; CRSNP is the placeholder that scan.glm
 # replaces with each SNP in turn.
 test.snp <- scan.glm("disease ~ CRSNP", family = gaussian(), data = g.dat)
 names(test.snp)
 # which() drops NA p-values instead of returning NA entries
 glm.hits <- which(test.snp$P1df < alpha)
 test.snp$snpnames[glm.hits]
 test.snp$P1df[glm.hits]
 # Score test (argument name spelled out rather than partially matched)
 test.qt <- qtscore(disease, data = g.dat, trait.type = "gaussian")
 slotNames(test.qt)
 names(test.qt@results)
 test.qt@lambda
 descriptives.scan(test.qt)
 res.qt <- results(test.qt)
 rownames(res.qt)[which(res.qt$P1df < alpha)]
 res.qt$P1df[which(res.qt$P1df < alpha)]
 res.qt$Pc1df[which(res.qt$Pc1df < alpha)]
 # QQ plot: sorted observed p-values against uniform expected quantiles
 obs <- sort(res.qt$P1df)
 ept <- seq_along(obs) / (length(obs) + 1)
 plot(
   -log10(ept), -log10(obs),
   main = "GWAS QQ plot, qtl",
   xlab = "Expected -log10(pvalue)",
   ylab = "Observed -log10(pvalue)"
 )
 abline(0, 1, col = "red")
 # Dashed line at the same threshold that is used to select SNPs above
 abline(h = -log10(alpha), lty = 2)
 # Manhattan plot
 plot(test.qt, col = "black")
 # Adding confounders
 test.qt.sex <- qtscore(disease ~ sex, data = g.dat, trait.type = "gaussian")
 # Select on the sex-adjusted p-values themselves; the unadjusted scan's
 # p-values would merely repeat the earlier list of SNPs.
 res.qt.sex <- results(test.qt.sex)
 rownames(res.qt.sex)[which(res.qt.sex$P1df < alpha)]
 # lm() needs a data frame, so pass the phenotype table explicitly
 summary(lm(disease ~ sex, data = g.dat@phdata))
 ###
 # MDS
 ###
 # Pairwise identity-by-state matrix computed with weight = "freq"
 gkin <- ibs(g.dat, weight = "freq")
 gkin[1:10, 1:10]
 # Classical MDS on the dissimilarity 0.5 - gkin, keeping 10 axes and the
 # eigenvalues (needed later for the scree plot).
 cps.full <- cmdscale(as.dist(0.5 - gkin), eig = TRUE, k = 10)
 names(cps.full)
 cps <- cps.full$points
 # First two axes; the plotting symbol is taken from the popn column.
 plot(cps[, 1], cps[, 2], pch = g.dat@phdata$popn)
 # NOTE(review): assumes popn codes 1, 2, 3 correspond to TSI, MEX, CEU and
 # that the fixed legend position suits this data set - confirm.
 legend(-0.16, 0.06, c("TSI", "MEX", "CEU"), pch = c(1, 2, 3))
 ###
 # Corrected test
 ###
 # Incorporating PCs as predictors
 # Name the MDS axes C1, C2, ...; derived from the actual column count so the
 # assignment still works if cmdscale() returned fewer axes than requested.
 colnames(cps) <- paste0("C", seq_len(ncol(cps)))
 # Work on a copy so the original phenotype table stays untouched
 gpc.dat <- g.dat
 gpc.dat@phdata <- cbind(g.dat@phdata, cps)
 test.pc.a <- scan.glm(
   "disease ~ CRSNP + C1 + C2 + C3 + C4 + C5",
   family = gaussian(),
   data = gpc.dat
 )
 # which() drops NA p-values instead of returning NA entries
 pc.hits <- which(test.pc.a$P1df < alpha)
 test.pc.a$snpnames[pc.hits]
 test.pc.a$P1df[pc.hits]
 test.pc.b <- qtscore(
   disease ~ C1 + C2 + C3 + C4 + C5,
   data = gpc.dat,
   trait.type = "gaussian"
 )
 test.pc.b@lambda
 # scree plot: share of the eigenvalue total carried by each of the first 10
 plot(
   cps.full$eig[1:10] / sum(cps.full$eig),
   axes = FALSE,
   type = "b",
   xlab = "Components",
   ylim = c(0, 0.05),
   ylab = "Proportion of Variations",
   main = "MDS analysis scree plot"
 )
 axis(1, 1:10)
 axis(2)
 # cumulative plot
 plot(
   cumsum(cps.full$eig[1:10]) / sum(cps.full$eig),
   axes = FALSE,
   type = "b",
   ylim = c(0, 0.2),
   xlab = "Components",
   ylab = "Proportion of Variations",
   main = "MDS analysis cumulative plot"
 )
 axis(1, 1:10)
 axis(2)
 # Genomic control
 # Uncorrected GIF
 test.qt@lambda
 # Corrected p-value (Pc1df); which() drops NA p-values from the selection
 gc.res <- results(test.qt)
 gc.hits <- which(gc.res$Pc1df < alpha)
 row.names(gc.res)[gc.hits]
 gc.res$Pc1df[gc.hits]
 # Check for inflation of statistic
 obs <- sort(gc.res$chi2.1df)
 # qchisq() is monotone in its probability argument, so these expected
 # quantiles are already in increasing order and need no extra sort().
 ept <- qchisq(seq_along(obs) / (length(obs) + 1), df = 1)
 plot(
   ept, obs,
   main = "Genomic control (slope is the inflation factor)",
   xlab = "Expected chisq, 1df",
   ylab = "Observed chisq, 1df"
 )
 abline(0, 1, col = "red")
 # lambda is a list; pass its numeric estimate as the slope
 abline(0, test.qt@lambda$estimate, lty = 2)
 # Definition of GIF
 # Conventional definition: median observed statistic divided by the median
 # of a 1-df chi-square (about 0.455), computed exactly instead of rounded.
 median(gc.res$chi2.1df) / qchisq(0.5, df = 1)
 # GenABEL definition
 coef(lm(obs ~ ept))[2]
 # QQ plot
 obs <- sort(gc.res$Pc1df)
 ept <- seq_along(obs) / (length(obs) + 1)
 plot(
   -log10(ept), -log10(obs),
   main = "GWAS QQ plot adj. via Genomic Control",
   xlab = "Expected -log10(pvalue)",
   ylab = "Observed -log10(pvalue)"
 )
 abline(0, 1, col = "red")
 # Dashed line at the same threshold that is used to select SNPs above
 abline(h = -log10(alpha), lty = 2)
 # EIGENSTRAT
 # Copy the IBS matrix and replace its diagonal with the Var column returned
 # by hom() before handing it to egscore().
 adj.gkin <- gkin
 diag(adj.gkin) <- hom(g.dat)$Var
 # naxes = 3 is the default value; two axes are used here
 test.eg <- egscore(
   disease,
   data = g.dat,
   kinship.matrix = adj.gkin,
   naxes = 2
 )
 descriptives.scan(test.eg)
 # which() drops NA p-values instead of returning NA entries
 eg.res <- results(test.eg)
 eg.hits <- which(eg.res$P1df < alpha)
 snp.eg <- row.names(eg.res)[eg.hits]
 pvalue.eg <- eg.res$P1df[eg.hits]
 lambda.eg <- test.eg@lambda
 snp.eg
 pvalue.eg
 lambda.eg
 # Change #PCs: print the inflation estimate for 1 to 10 adjustment axes
 for (k in seq_len(10)) {
   test.tmp <- egscore(
     disease,
     data = g.dat,
     kinship.matrix = adj.gkin,
     naxes = k
   )
   print(test.tmp@lambda$estimate)
 }
 # QQ plot
 obs <- sort(eg.res$Pc1df)
 ept <- seq_along(obs) / (length(obs) + 1)
 # plot() is used as in the other QQ plots of this script; both vectors are
 # already ordered, so the points are the same ones qqplot() would draw.
 plot(
   -log10(ept), -log10(obs),
   main = "GWAS QQ plot adj. w/ EIGENSTRAT",
   xlab = "Expected -log10(pvalue)",
   ylab = "Observed -log10(pvalue)"
 )
 abline(0, 1, col = "red")
 # Dashed line at the same threshold that is used to select SNPs above
 abline(h = -log10(alpha), lty = 2)
 # Manhattan plot comparison
 plot(test.qt, col = "black")
 add.plot(test.eg, col = "gray", pch = 3)
 # Legend colours follow the colours of the two point sets drawn above
 legend(
   "topright",
   c("Original plot", "After correction w/ EIGENSTRAT"),
   pch = c(1, 3),
   col = c("black", "gray")
 )
 ###
 # Basic test, binary trait
 ###
 # load files to GenABEL
 convert.snp.tped(
   tped = "gwa_gabel.tped",
   tfam = "gwa_gabel.tfam",
   out = "gwa_gabel.raw",
   strand = "u"
 )
 # TRUE is spelled out: T is an ordinary variable that can be reassigned.
 b.dat <- load.gwaa.data(
   phen = "gwa_gabel.praw",
   gen = "gwa_gabel.raw",
   force = TRUE
 )
 slotNames(b.dat)
 slotNames(b.dat@gtdata)
 colnames(b.dat@phdata)
 # sample size
 b.dat@gtdata@nids
 # number of cases and controls (disease coded 1 / 0; missing values skipped)
 case.size <- sum(b.dat@phdata$disease == 1, na.rm = TRUE)
 control.size <- sum(b.dat@phdata$disease == 0, na.rm = TRUE)
 case.size
 control.size
 # number of SNPs
 snpsb.total <- b.dat@gtdata@nsnps
 # GLM test: logistic regression per SNP
 testb.snp <- scan.glm("disease ~ CRSNP", family = binomial(), data = b.dat)
 names(testb.snp)
 alpha <- 5e-8
 # which() drops NA p-values instead of returning NA entries
 glmb.hits <- which(testb.snp$P1df < alpha)
 testb.snp$snpnames[glmb.hits]
 testb.snp$P1df[glmb.hits]
 # Score test (argument name spelled out rather than partially matched)
 testb.qt <- qtscore(disease, data = b.dat, trait.type = "binomial")
 slotNames(testb.qt)
 descriptives.scan(testb.qt)
 resb.qt <- results(testb.qt)
 row.names(resb.qt)[which(resb.qt$P1df < alpha)]
 resb.qt$P1df[which(resb.qt$P1df < alpha)]
 resb.qt$Pc1df[which(resb.qt$Pc1df < alpha)]