To estimate the proportion of phenotypic variability explained by our GWAS data, we conducted a bunch of analysis. We also used two data sets to test how much variance our estimated SNP effects could explain in out-of-sample prediction.
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
geno=${WD}/PLINK/ukb_v3_eur_snp8m
grm=$WD/input/UKB_HM3_m01_sp
pheno=$WD/input/mlm_pheno
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates
prefix=vitD_fastGWA_1f
output=${tmpDir}/${prefix}
tmp_command="gcta64 --bfile $geno \
--fastGWA-lmm \
--pheno $pheno \
--grm-sparse $grm \
--qcovar $quantCovars\
--covar $catgCovars \
--covar-maxlevel 110 \
--threads 4 \
--out $output"
qsubshcom "$tmp_command" 4 40G $prefix 10:00:00 ""
Here is the heritability output from fastGWA.
Sampling variance/covariance of the estimates of Vg and Ve:
8.61385e-05 -8.41136e-05
-8.41136e-05 8.53273e-05
Source Variance SE Vg 0.264123 0.00928109 Ve 0.556691 0.00923728 Vp 0.820814 Heritability = 0.321782 (Pval = 3.85987e-178)
S.E. of hsq is not in log file from fastGWA. We used this formular to calculate:
se(Vg/Vp) = se(Vg) / Vp = 0.009 / 0.82 = 0.01
BMI was added into the quantitative covariates file.
library(data.table)
quantC = data.frame(fread("input/quantitativeCovariates"))
pheno = data.frame(fread("input/vitD_phenotypes"))
quantC$bmi = pheno[match(quantC$IID, pheno$IID),"bmi"]
Then we run fastGWA.
prefix=vitD_fastGWA_1f_with_BMI
output=${tmpDir}/${prefix}
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates
tmp_command="gcta64 --bfile $geno \
--fastGWA-lmm \
--pheno $pheno \
--grm-sparse $grm \
--qcovar ${quantCovars}_plusBMI \
--covar $catgCovars \
--covar-maxlevel 110 \
--threads 4 \
--out $output"
qsubshcom "$tmp_command" 4 40G $prefix 10:00:00 ""
Here is the heritability output from fastGWA.
Sampling variance/covariance of the estimates of Vg and Ve:
8.0854e-05 -7.89775e-05
-7.89775e-05 8.01352e-05
Source Variance SE Vg 0.252569 0.00899189 Ve 0.540679 0.00895183 Vp 0.793248 Heritability = 0.318398 (Pval = 1.35263e-173)
S.E. of hsq is not in log file from fastGWA. We used this formular to calculate:
se(Vg/Vp) = se(Vg) / Vp = 0.009 / 0.79 = 0.01
We used the 50k unrelated v3EURu samples, and used only the SNPs with MAF > 1E-04 to generate LD score.
id=unrelated.50k.fam
prefix=subset
## substract the plink file
tmp_command="gcta64 \
--bfile v3EURu_impQC/ukbEURu_imp_chr{TASK_ID}_v3_impQC \
--keep $id \
--maf 0.0001 \
--thread-num 10 \
--make-bed \
--out PLINK/ukbEURu_imp_chr{TASK_ID}_v3_impQC "
qsubshcom "$tmp_command" 10 40G $prefix 20:00:00 "-array=1-22"
## calculate LD score.
prefix=LDscore_local
tmp_command="gcta64 \
--bfile PLINK/ukbEURu_imp_chr{TASK_ID}_v3_impQC \
--ld-score-region 200 \
--thread-num 4 \
--out LDscore_v3EURu_impQC_50k_local/ukbEURu_imp_chr{TASK_ID}_v3_impQC_LDscore "
qsubshcom "$tmp_command" 4 120G $prefix 20:00:00 "-array=1-22"
library(data.table)
ukb.ld.c = NULL
for (i in 1:22){
ukb.ld = fread(paste0("LDscore_v3EURu_impQC_50k_local/ukbEURu_imp_chr", i, "_v3_impQC_LDscore.score.ld"), header = T)
ukb.ld.c = rbind(ukb.ld.c, ukb.ld)
}
ukb.ld.c = data.frame(ukb.ld.c)
ukb.ld.c$MAF = NA
ukb.ld.c[ukb.ld.c$freq > 0.5,"MAF"] = 1 - ukb.ld.c[ukb.ld.c$freq > 0.5,"freq"]
ukb.ld.c[ukb.ld.c$freq <= 0.5, "MAF"] = ukb.ld.c[ukb.ld.c$freq <= 0.5,"freq"]
png("MAF_ldscore.png")
hist(ukb.ld.c$MAF)
dev.off()
png("MAF_in_H_L_ldscore.png")
par(mfrow=c(2,1))
hist(ukb.ld.c[ukb.ld.c$ldscore_SNP >= median(ukb.ld.c$ldscore_SNP) ,]$MAF)
hist(ukb.ld.c[ukb.ld.c$ldscore_SNP < median(ukb.ld.c$ldscore_SNP) ,]$MAF)
dev.off()
####################
## check distribution
library(data.table)
ukb.ld.c = data.frame(fread("ukb.SNP.maf01.txt", header = T))
n.snp.5bin = data.frame(c(
nrow(ukb.ld.c[ukb.ld.c$MAF >=0.01 & ukb.ld.c$MAF < 0.1,]),
nrow(ukb.ld.c[ukb.ld.c$MAF >=0.1 & ukb.ld.c$MAF < 0.2,]),
nrow(ukb.ld.c[ukb.ld.c$MAF >=0.2 & ukb.ld.c$MAF < 0.3,]),
nrow(ukb.ld.c[ukb.ld.c$MAF >=0.3 & ukb.ld.c$MAF < 0.4,]),
nrow(ukb.ld.c[ukb.ld.c$MAF >=0.4 & ukb.ld.c$MAF < 0.5,])))
colnames(n.snp.5bin) = "n.SNP"
n.snp.5bin$bin = c("0.01-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4","0.4-0.5")
n.snp.5bin.plot <- ggplot(n.snp.5bin, aes(x=bin, y=n.SNP)) +
geom_bar(stat="identity",
position=position_dodge()) +
labs(title="number of SNPs in each bin", x="MAF threshold", y = "number of SNPs")+
theme_classic()
####################
## stratify MAF
group1 = ukb.ld.c[ukb.ld.c$MAF >=0.01 & ukb.ld.c$MAF < 0.1,]
group2 = ukb.ld.c[ukb.ld.c$MAF >=0.1 & ukb.ld.c$MAF < 0.2,]
group3 = ukb.ld.c[ukb.ld.c$MAF >=0.2 & ukb.ld.c$MAF < 0.3,]
group4 = ukb.ld.c[ukb.ld.c$MAF >=0.3 & ukb.ld.c$MAF < 0.4,]
group5 = ukb.ld.c[ukb.ld.c$MAF >=0.4 & ukb.ld.c$MAF <= 0.5,]
## stratify further with LD median
group11 = group1[group1$ldscore_SNP >= median(group1$ldscore_SNP) ,]
group12 = group1[group1$ldscore_SNP < median(group1$ldscore_SNP) ,]
group21 = group2[group2$ldscore_SNP >= median(group2$ldscore_SNP) ,]
group22 = group2[group2$ldscore_SNP < median(group2$ldscore_SNP) ,]
group31 = group3[group3$ldscore_SNP >= median(group3$ldscore_SNP) ,]
group32 = group3[group3$ldscore_SNP < median(group3$ldscore_SNP) ,]
group41 = group4[group4$ldscore_SNP >= median(group4$ldscore_SNP) ,]
group42 = group4[group4$ldscore_SNP < median(group4$ldscore_SNP) ,]
group51 = group5[group5$ldscore_SNP >= median(group5$ldscore_SNP) ,]
group52 = group5[group5$ldscore_SNP < median(group5$ldscore_SNP) ,]
## stratify all snps with LD median
ukb.ld.c.01 = ukb.ld.c[ukb.ld.c$MAF >= 0.01,]
group.high = ukb.ld.c.01[ukb.ld.c.01$ldscore_SNP >= median(ukb.ld.c.01$ldscore_SNP) ,]
group.low = ukb.ld.c.01[ukb.ld.c.01$ldscore_SNP < median(ukb.ld.c.01$ldscore_SNP) ,]
## output SNP lists
write.table(group11$SNP, "SNP_lists/hig.ld.SNPs_group11.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group12$SNP, "SNP_lists/low.ld.SNPs_group12.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group21$SNP, "SNP_lists/hig.ld.SNPs_group21.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group22$SNP, "SNP_lists/low.ld.SNPs_group22.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group31$SNP, "SNP_lists/hig.ld.SNPs_group31.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group32$SNP, "SNP_lists/low.ld.SNPs_group32.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group41$SNP, "SNP_lists/hig.ld.SNPs_group41.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group42$SNP, "SNP_lists/low.ld.SNPs_group42.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group51$SNP, "SNP_lists/hig.ld.SNPs_group51.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group52$SNP, "SNP_lists/low.ld.SNPs_group52.txt", quote = F, sep ="\t", row.names = F, col.names = F)
## output SNP lists for MAF GREML
write.table(group1$SNP, "SNP_lists/SNPs_group1.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group2$SNP, "SNP_lists/SNPs_group2.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group3$SNP, "SNP_lists/SNPs_group3.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group4$SNP, "SNP_lists/SNPs_group4.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group5$SNP, "SNP_lists/SNPs_group5.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group.high$SNP, "SNP_lists/SNPs_group.high.txt", quote = F, sep ="\t", row.names = F, col.names = F)
write.table(group.low$SNP, "SNP_lists/SNPs_group.low.txt", quote = F, sep ="\t", row.names = F, col.names = F)
We generated GRMs using the SNPs in each stratification. GRM are generated for 50k unrelated individuals and 50k unrelated individuals measured in summer.
## for 50k unrelated samples
command11="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group11.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group11_50k_GRM"
command12="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group12.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group12_50k_GRM"
command21="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group21.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group21_50k_GRM"
command22="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group22.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group22_50k_GRM"
command31="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group31.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group31_50k_GRM"
command32="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group32.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group32_50k_GRM"
command41="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group41.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group41_50k_GRM"
command42="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group42.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group42_50k_GRM"
command51="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group51.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group51_50k_GRM"
command52="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group52.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group52_50k_GRM"
command1="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group1.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group1_50k_GRM"
command2="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group2.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group2_50k_GRM"
command3="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group3.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group3_50k_GRM"
command4="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group4.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group4_50k_GRM"
command5="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group5.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group5_50k_GRM"
commandh="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LD_GREML/SNPs_group.high.txt --thread-num 10 --make-grm --out LD_GREML/SNPs_group.hig_50k_GRM"
commandl="gcta64 --mbfile ukbEURu_imp_mbfile.txt --extract LD_GREML/SNPs_group.low.txt --thread-num 10 --make-grm --out LD_GREML/SNPs_group.low_50k_GRM"
qsubshcom "$command11" 10 80G "GRM_11" 20:00:00 ""
qsubshcom "$command12" 10 80G "GRM_12" 20:00:00 ""
qsubshcom "$command21" 10 80G "GRM_21" 20:00:00 ""
qsubshcom "$command22" 10 80G "GRM_22" 20:00:00 ""
qsubshcom "$command31" 10 80G "GRM_31" 20:00:00 ""
qsubshcom "$command32" 10 80G "GRM_32" 20:00:00 ""
qsubshcom "$command41" 10 80G "GRM_41" 20:00:00 ""
qsubshcom "$command42" 10 80G "GRM_42" 20:00:00 ""
qsubshcom "$command51" 10 80G "GRM_51" 20:00:00 ""
qsubshcom "$command52" 10 80G "GRM_52" 20:00:00 ""
qsubshcom "$command1" 10 80G "GRM_1" 20:00:00 ""
qsubshcom "$command2" 10 80G "GRM_2" 20:00:00 ""
qsubshcom "$command3" 10 80G "GRM_3" 20:00:00 ""
qsubshcom "$command4" 10 80G "GRM_4" 20:00:00 ""
qsubshcom "$command5" 10 80G "GRM_5" 20:00:00 ""
qsubshcom "$commandh" 10 80G "GRM_h" 20:00:00 ""
qsubshcom "$commandl" 10 80G "GRM_l" 20:00:00 ""
## for 50k unrelated individuals measured in summer.
command11="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group11.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group11_50ksummer_GRM"
command12="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group12.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group12_50ksummer_GRM"
command21="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group21.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group21_50ksummer_GRM"
command22="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group22.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group22_50ksummer_GRM"
command31="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group31.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group31_50ksummer_GRM"
command32="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group32.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group32_50ksummer_GRM"
command41="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group41.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group41_50ksummer_GRM"
command42="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group42.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group42_50ksummer_GRM"
command51="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/hig.ld.SNPs_group51.txt --thread-num 10 --make-grm --out LDMS_5by2/hig.ld.SNPs_group51_50ksummer_GRM"
command52="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LDMS_5by2/low.ld.SNPs_group52.txt --thread-num 10 --make-grm --out LDMS_5by2/low.ld.SNPs_group52_50ksummer_GRM"
command1="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group1.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group1_50ksummer_GRM"
command2="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group2.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group2_50ksummer_GRM"
command3="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group3.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group3_50ksummer_GRM"
command4="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group4.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group4_50ksummer_GRM"
command5="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract MAF_GREML/SNPs_group5.txt --thread-num 10 --make-grm --out MAF_GREML/SNPs_group5_50ksummer_GRM"
commandh="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LD_GREML/SNPs_group.high.txt --thread-num 10 --make-grm --out LD_GREML/SNPs_group.hig_50ksummer_GRM"
commandl="gcta64 --mbfile summer.ukbEURu_imp_mbfile.txt --extract LD_GREML/SNPs_group.low.txt --thread-num 10 --make-grm --out LD_GREML/SNPs_group.low_50ksummer_GRM"
qsubshcom "$command11" 10 80G "GRM_11" 20:00:00 ""
qsubshcom "$command12" 10 80G "GRM_12" 20:00:00 ""
qsubshcom "$command21" 10 80G "GRM_21" 20:00:00 ""
qsubshcom "$command22" 10 80G "GRM_22" 20:00:00 ""
qsubshcom "$command31" 10 80G "GRM_31" 20:00:00 ""
qsubshcom "$command32" 10 80G "GRM_32" 20:00:00 ""
qsubshcom "$command41" 10 80G "GRM_41" 20:00:00 ""
qsubshcom "$command42" 10 80G "GRM_42" 20:00:00 ""
qsubshcom "$command51" 10 80G "GRM_51" 20:00:00 ""
qsubshcom "$command52" 10 80G "GRM_52" 20:00:00 ""
qsubshcom "$command1" 10 80G "GRM_1_summer" 20:00:00 ""
qsubshcom "$command2" 10 80G "GRM_2_summer" 20:00:00 ""
qsubshcom "$command3" 10 80G "GRM_3_summer" 20:00:00 ""
qsubshcom "$command4" 10 80G "GRM_4_summer" 20:00:00 ""
qsubshcom "$command5" 10 80G "GRM_5_summer" 20:00:00 ""
qsubshcom "$commandh" 10 80G "GRM_h_summer" 20:00:00 ""
qsubshcom "$commandl" 10 80G "GRM_l_summer" 20:00:00 ""
GRMs are listed in files to run GREML_LDMS.
pheno=$WD/input/mlm_pheno
## in mgrm_LDMS.txt
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates
prefix=LDMS_5by2
tmp_command="gcta64 --reml \
--mgrm mgrm_5by2.txt \
--qcovar ${quantCovars} \
--covar $catgCovars \
--pheno $pheno \
--thread-num 10 \
--out ${prefix} "
qsubshcom "$tmp_command" 10 500G $prefix 20:00:00 ""
prefix=LDMS_5by2_summer
tmp_command="gcta64 --reml \
--mgrm mgrm_5by2_summer.txt \
--qcovar ${quantCovars} \
--covar $catgCovars \
--pheno $pheno \
--thread-num 10 \
--out ${prefix} "
qsubshcom "$tmp_command" 10 500G $prefix 20:00:00 ""
prefix=LDMS_5by2_covBMI
tmp_command="gcta64 --reml \
--mgrm mgrm_5by2.txt \
--qcovar ${quantCovars}_plusBMI \
--covar $catgCovars \
--pheno $pheno \
--thread-num 10 \
--out ${prefix} "
qsubshcom "$tmp_command" 10 500G $prefix 20:00:00 ""
prefix=LDMS_5by2_summer_covBMI
tmp_command="gcta64 --reml \
--mgrm mgrm_5by2_summer.txt \
--qcovar ${quantCovars}_plusBMI \
--covar $catgCovars \
--pheno $pheno \
--thread-num 10 \
--out ${prefix} "
qsubshcom "$tmp_command" 10 500G $prefix 20:00:00 ""
Here we run GREML with summer and winter measurements as two phenotypes.
We write vitD measurements in summer and winter into two columns in a phenotype file.
##get biv
library(data.table)
library(dplyr)
summer = data.frame(fread("input/mlm_summer_pheno"))
winter = data.frame(fread("input/mlm_winter_pheno"))
unrelated.50k = data.frame(fread("sample_lists/unrelated.50k.fam"))
biv.summer = summer[summer$IID%in%unrelated.50k$IID,1:3]
biv.summer$winter = NA
biv.winter = winter[winter$IID%in%unrelated.50k$IID,1:3]
biv.winter$summer = NA
colnames(biv.winter)[3] = "winter"
colnames(biv.summer)[3] = "summer"
biv.winter = biv.winter[,c(1,2,4,3)]
biv = rbind(biv.winter, biv.summer)
write.table(biv, file="mlm_bivariate_50k_pheno", quote = F, sep ="\t", row.names = F, col.names = F)
We ran the GREML analysis without Centre and asssessMonth as covariates, since the samples are not balanced in measurements in summer and winter.
awk '{print $1, $2, $3, $4, $6}' input/categoricalCovariates_withoutmonth > input/categoricalCovariates_withoutassess
pheno=$WD/mlm_bivariate_50k_pheno
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates_withoutassess
GRMpath=$WD/GRMs_subtracted_from_HM3_m01/
GRM=UKB_HM3_m01_unrelated_50k
prefix=bivGREML
tmp_command="gcta64 \
--reml-bivar \
--pheno $pheno \
--reml-bivar-no-constrain \
--grm ${GRMpath}/${GRM} \
--qcovar ${quantCovars} \
--covar ${catgCovars} \
--threads 10 \
--out vitD_bivariate_summer_winter_withoutassess "
qsubshcom "$tmp_command" 10 500G $prefix 20:00:00 ""
prefix=bivGREML_plusBMI
tmp_command="gcta64 \
--reml-bivar \
--pheno $pheno \
--reml-bivar-no-constrain \
--grm ${GRMpath}/${GRM} \
--qcovar ${quantCovars}_plusBMI \
--covar ${catgCovars} \
--threads 10 \
--out vitD_bivariate_summer_winter_with_BMI"
qsubshcom "$tmp_command" 10 500G $prefix 20:00:00 ""
This step is to confirm our anlaysis is correct. Result is not reported in publication.
library(data.table)
biv.cat = read.table("input/categoricalCovariates_without ", header = T)
summer = data.frame(fread("input/mlm_summer_pheno"))
winter = data.frame(fread("input/mlm_winter_pheno"))
biv.cat$season = as.numeric(biv.pheno$IID %in% summer$IID)
write.table(biv.pheno, file="input/categoricalCovariates_without _withseason", quote = F, sep ="\t", row.names = F, col.names = F)
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates_without_withseason
pheno=$WD/input/mlm_pheno
prefix=GREML_unrelated50k_onseason_benchmark
## without BMI
tmp_command="gcta64 --reml \
--grm ${WD}/GRMs_subtracted_from_HM3_m01/${GRM}_unrelated_50k \
--qcovar ${quantCovars} \
--covar ${catgCovars} \
--pheno ${pheno} \
--threads 10 \
--out ${prefix} "
qsubshcom "$tmp_command" 10 200G $prefix 30:00:00 ""
Here we write BMI and vitD into one phenotype file for both 50k unrelated samples and 50k unrelated samples measured in summer.
##get biv
library(data.table)
library(dplyr)
pheno = data.frame(fread("input/vitD_phenotypes", header = T))
mlm_pheno = data.frame(fread("input/mlm_pheno", header = T))
unrelated.50k = read.table("unrelated.50k.fam")
unrelated.50k$vitD = mlm_pheno[match(unrelated.50k$V2, mlm_pheno$IID),"norm_vitD"]
unrelated.50k$bmi = pheno[match(unrelated.50k$V2, pheno$IID), "bmi"]
write.table(unrelated.50k, file="mlm_bivariate_50k_vitD_bmi", quote = F, sep ="\t", row.names = F, col.names = F)
unrelated.50k.summer = read.table("unrelated.50k.summer.fam")
unrelated.50k.summer$vitD = mlm_pheno[match(unrelated.50k.summer$V2, mlm_pheno$IID),"norm_vitD"]
unrelated.50k.summer$bmi = pheno[match(unrelated.50k.summer$V2, pheno$IID), "bmi"]
write.table(unrelated.50k.summer, file="mlm_bivariate_50k.summer_vitD_bmi", quote = F, sep ="\t", row.names = F, col.names = F)
pheno=$WD/mlm_bivariate_5k_vitD_bmi
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates
## with all indi
pheno=$WD/mlm_bivariate_50k_vitD_bmi
prefix=bivGREMLVB_50k
tmp_command="gcta64 \
--reml-bivar \
--pheno $pheno \
--reml-bivar-no-constrain \
--grm GRMs_subtracted_from_HM3_m01/UKB_HM3_m01_unrelated_50k \
--qcovar ${quantCovars} \
--covar ${catgCovars} \
--threads 10 \
--out vitD_bivariate_vitD_BMI_50k"
qsubshcom "$tmp_command" 10 500G $prefix 50:00:00 ""
## with summer measures
quantCovars=$WD/input/quantitativeCovariates
catgCovars=$WD/input/categoricalCovariates
pheno=$WD/mlm_bivariate_50k.summer_vitD_bmi
prefix=bivGREMLVB_50ksummer
tmp_command="gcta64 \
--reml-bivar \
--pheno $pheno \
--reml-bivar-no-constrain \
--grm GRMs_subtracted_from_HM3_m01/UKB_HM3_m01_unrelated_50k.summer \
--qcovar ${quantCovars} \
--covar ${catgCovars} \
--threads 10 \
--out vitD_bivariate_vitD_BMI_50ksummer"
qsubshcom "$tmp_command" 10 500G $prefix 50:00:00 ""
rE = c(E)/ (V(e1)*V(e2))^0.5
re = -0.590901 / (0.711139*16.048647)^0.5
se = (Var(re))^0.5
SBayesS and SBayesR were done with our fastGWA result.
SBayesS was carried out using GCTB.
filename <- commandArgs(trailingOnly = TRUE)
library(data.table)
sum.data = data.frame(fread(filename, header = T))
sum.data.formatted = sum.data[,c("SNP", "A1", "A2", "AF1", "BETA", "SE", "P", "N")]
colnames(sum.data.formatted) = c("SNP", "A1", "A2", "freq", "b", "se", "p", "N")
write.table(sum.data.formatted, file=paste0(filename, ".ma"), quote = F, sep ="\t", col.names = T, row.names = F)
prefix="SBayesS_shunk"
mldm="$uqllloyd/sbayesr/pre_review/ukbEURu_hm3_sparse/ukbEURu_hm3_sparse_mldm_list.txt"
gwas="vitD_summerGWAS"
gwas="vitD_fastGWA_withX"
gwas="vitD_BMIcov_fastGWA"
gwas="vitD_winterGWAS"
tmp_command="gctb \
--sbayes S \
--mldm $mldm \
--gwas-summary ${gwas}.ma \
--pi 0.01 \
--hsq 0.5 \
--num-chains 4 \
--chain-length 25000 \
--burn-in 5000 \
--seed 12345 \
--thread 4 \
--no-mcmc-bin \
--out ${gwas}_SBayesS_withshrunkLM_nops "
qsubshcom "$tmp_command" 4 100G ${prefix}_${gwas}_nops 20:00:00 ""
SBayesR was carried out using GCTB.
prefix="SBayesR"
#ldm="$jzeng/proj/sbayes/ldm/ukb/n50k_hm3/ukbEURu_imp_v3_HM3_n50k.chisq10.ldm.sparse"
mldm="$uqllloyd/sbayesr/pre_review/ukbEURu_hm3_sparse/ukbEURu_hm3_sparse_mldm_list.txt"
gwas="vitD_fastGWA_withX"
gwas="vitD_BMIcov_fastGWA"
gwas="vitD_winterGWAS"
gwas="vitD_summerGWAS"
tmp_command="gctb \
--sbayes R \
--mldm $mldm \
--gwas-summary ../SBayesS/${gwas}.ma \
--pi 0.01 \
--hsq 0.5 \
--chain-length 25000 \
--burn-in 5000 \
--seed 12345 \
--thread 4 \
--no-mcmc-bin \
--out ${gwas}_SBayesR_shrunkLD_nops "
qsubshcom "$tmp_command" 4 50G ${prefix}_${gwas}_nops 20:00:00 ""
library(ggplot2)
library(ggpubr)
her.est = read.csv("input/heritatiliby_estimates_full_version_oct30.csv")
her.est = her.est[is.na(her.est$category) == F ,]
cat.her.est = nrow(her.est)
her.est$Method = (factor(her.est$Method, levels=her.est[seq(cat.her.est, 2, by = -2),"Method"]))
her.est$BMI.as.covariate = (factor(her.est$BMI.as.covariate, levels= c("yes", "no")))
her.est$var.source = factor(her.est$var.source, levels=c("Heritability estimates", "SNP-based heritability estimates", "GWAS SNP-based heritability estimates"))
heritability.plot <- ggplot(data=her.est, aes(x=Method, y=Heritability, fill=as.factor(BMI.as.covariate))) +
geom_bar(stat="identity", width=0.9, position = position_dodge(width=0.9)) +
coord_flip() +
labs(title="", x ="", y = expression(Estimate %+-% 1.96*SE)) +
facet_grid(var.source~., scales="free", space="free", switch="y", labeller = label_wrap_gen()) + theme_bw() +
theme(axis.text.x = element_text(angle = 0, hjust = 0, face = "bold"),
axis.text.y = element_text(face = "bold"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(hjust = 0.5),
legend.background = element_rect(fill="white", size=0.5, linetype="solid", color=NA),
legend.title = element_blank(),
legend.position = "top") +
theme(strip.text.y = element_text(face="bold", angle = 180), strip.placement = "outside", strip.background=element_rect(color = NA)) +
theme(axis.line = element_line(color="black")) +
labs(fill = "BMI as covariate") +
geom_errorbar(aes(ymin=Heritability-1.96*SE, ymax=Heritability+1.96*SE, colors="black"), width=.2, position=position_dodge(0.9))+
scale_fill_manual(values=c("grey", "light blue"), labels=c("BMI fitted","BMI not fitted")) + scale_colour_manual(values=c("grey", "light blue")) +
guides(col=F) +
ylim(0,0.4)
heritability.plot
## the table
heri.corr.table = read.csv("Rg_table.csv")
#heri.corr.table = her.est[is.na(her.est$Rg) ==F , c("category","Rg", "Rg.se")][1:4,]
#heri.corr.table[,"category"] = c("25OHD/BMI", "25OHD/BMI (summer)", "summer/winter", "summer/winter (BMI cov)")
colnames(heri.corr.table) = c(" ", "Method","genetic correlation", "s.e.")
heri.corr.table[,3:4] = round(heri.corr.table[,3:4] , digits = 2)
heri.corr <- ggtexttable(heri.corr.table, rows = NULL,
theme = ttheme("mOrange"))
heritability.sup.fig = ggarrange(heritability.plot, heri.corr,
ncol = 1, nrow = 2,
heights = c(1, 0.2), labels = c("A", "B"))
ggsave(heritability.sup.fig, file="heritability_plot_supFig2_1113.png", width = 8, height = 12)
## plot the number of SNPs
ldms = read.csv("AllLDMS_heritatiliby_estimates_melted_191111.csv")
ldms.snp = unique(ldms[ldms$Analysis=="5by2" | ldms$Analysis =="5by1", c("MAF", "SNP", "LD", "Analysis", "colorcode")])
ldms.snp$MAF = (factor(ldms.snp$MAF, levels=c("0.01-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "0.4-0.5", "0.01-0.5")))
ldms.snp$LD = (factor(ldms.snp$LD, levels=c("High LD", "Low LD", "All")))
p.n.snp<- ggplot(ldms.snp, aes(x=MAF, y=SNP, fill=LD)) +
geom_bar(stat="identity",
position=position_dodge()) +
labs(title="number of SNPs in each bin", x="MAF threshold", y = "number of SNPs")+
theme_classic() +
scale_fill_manual(values=c("light blue","#C3D7A4", "burlywood2")) + theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(hjust = 0.5),
#legend.background = element_rect(fill="white", size=0.5, linetype="solid", color="black"),
legend.position = "none" )
REML_VitD = ldms[ldms$Analysis=="5by2" | ldms$Analysis =="5by1",]
REML_VitD$MAF = (factor(REML_VitD$MAF, levels=c("0.01-0.1", "0.1-0.2", "0.2-0.3", "0.3-0.4", "0.4-0.5")))
REML_VitD$covBMI = (factor(REML_VitD$covBMI, levels=c("BMI not fitted", "BMI fitted")))
REML_VitD$LD = (factor(REML_VitD$LD, levels=c("High LD", "Low LD", "All")))
REML_VitD_plot_season <- REML_VitD %>% filter(season == "All_season") %>%
ggplot(aes(x=MAF, y=Variance, fill=as.factor(LD))) +
geom_bar(stat="identity", position = "dodge") +
labs(title="All seasons", x ="MAF threshold", y = expression(Variance %+-% 1.96*SE)) +
facet_grid(~covBMI) + theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(hjust = 0.5),
legend.background = element_rect(fill="white", size=0.5, linetype="solid", color="black"),
legend.position = c(0.2, 0.70)) +
labs(fill = "") +
geom_errorbar(aes(ymin=Variance-1.96*SE, ymax=Variance+1.96*SE), color="black", width=.2, position=position_dodge(1))+
scale_fill_manual(values=c("light blue","#C3D7A4", "burlywood2"))+
ylim(-0.01,0.09)
REML_VitD_plot_summer <- REML_VitD %>% filter(season == "Summer") %>%
ggplot(aes(x=MAF, y=Variance, fill=as.factor(LD))) +
geom_bar(stat="identity", position = "dodge") +
labs(title="Summer", x ="MAF threshold", y = expression(Variance %+-% 1.96*SE)) +
facet_grid(~covBMI) + theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(hjust = 0.5),
#legend.background = element_rect(fill="white", size=0.5, linetype="solid", color="black"),
legend.position = "none") +
labs(fill = "Variant MAF") +
geom_errorbar(aes(ymin=Variance- 1.96*SE, ymax=Variance+1.96*SE), color="black", width=.2, position=position_dodge(1))+
scale_fill_manual(values=c("light blue","#C3D7A4", "burlywood2")) +
ylim(-0.01,0.09)
LDMS_VitD_ABC <- ggarrange(REML_VitD_plot_season, REML_VitD_plot_summer, ggarrange(p.n.snp, nrow=1, ncol=2 , common.legend= F),ncol =1, nrow=3, common.legend = F, labels = c("A", "B", "C"))
ggsave("VitD_LDMS_ABC_nov12.png", LDMS_VitD_ABC, width = 10, height = 13, dpi = 300, units = "cm", scale=2.5)
Nearly-British samples are selected using PC1 and PC2.
## awk '{print $1, $2, $3, $4, $5, $6}' 1000G.ukb_proj.proj.eigenvec > PC1_PC2_projection_of_UKB.txt
pc.pro = data.frame(fread("PC1_PC2_projection_of_UKB.txt"))
names(pc.pro) = c( "FID", "IID" , "PC1", "PC2", "PC3", "PC4")
EUR_fam = data.frame(fread("ukbEUR_imp_chr1_v3_impQC.fam"))
EURu_fam = data.frame(fread("ukbEURu_imp_chr8_v3_impQC.fam"))
vitd.pheno = data.frame(fread("mlm_pheno"))
pc.pro$eur = (pc.pro$IID%in%EUR_fam$V2)
pc.pro$eur.u = (pc.pro$IID%in%EURu_fam$V2)
pc.pro$vitd = (pc.pro$IID%in%vitd.pheno$IID)
pc1.l = min(pc.pro[pc.pro$IID %in% EUR_fam$V2,"PC1"])
pc1.h = max(pc.pro[pc.pro$IID %in% EUR_fam$V2,"PC1"])
pc2.l = min(pc.pro[pc.pro$IID %in% EUR_fam$V2,"PC2"])
pc2.h = max(pc.pro[pc.pro$IID %in% EUR_fam$V2,"PC2"])
plot.PC12.eur = ggplot(data = pc.pro, aes(x = PC1, y = PC2, color = eur )) + geom_point(size = 0.1) +
geom_vline(xintercept = pc1.l, size = 0.2) +
geom_vline(xintercept = pc1.h, size = 0.2) +
geom_hline(yintercept = pc2.l, size = 0.2) +
geom_hline(yintercept = pc2.h, size = 0.2)
plot.PC12.eur
pc.pro[which(pc.pro$PC1 < 0.003 & pc.pro$eur==F), "eur"] = "extended"
sample.list2 = pc.pro[which(pc.pro$eur == "extended"), c("FID", "IID")]
ggarrange(
ggplot(data = pc.pro, aes(x = PC1, y = PC2, color = eur)) + geom_point(size = 0.1) ,
ggplot(data = pc.pro, aes(x = PC2, y = PC3, color = eur)) + geom_point(size = 0.1),
ggplot(data = pc.pro, aes(x = PC3, y = PC4, color = eur)) + geom_point(size = 0.1),
ggplot(data = pc.pro, aes(x = PC1, y = PC4, color = eur)) + geom_point(size = 0.1),
nrow = 2, ncol =2
)
write.table(sample.list2[,c(1:2)], file="extended.samples", quote = F, sep = "\t", col.names = F, row.names = F)
Then we select unrelated samples, which are unrelated between each other.
tmp_command="gcta64 \
--mbfile v3All_impQC_mbfiles.txt \
--keep extended.samples \
--extract ukbEUR_imp_allchr_v3_HM3_maf01_R9.prune.in \
--make-grm \
--out Extended_5402"
prefix=extended.only
qsubshcom "$tmp_command" 1 200G $prefix 20:00:00 ""
## find unrelated
gcta64 --grm Extended_5402 --grm-singleton 0.05 --out Unrelated_in_extended_5402
There are 3968 unrelated samples out of the 4517 nearly-British samples.
Then we select samples from them which are unrelated to the EUR UKB individuals.
sample.list2 = read.table("Unrelated_in_extended_5402.singleton.txt")
colnames(sample.list2) = c("FID", "IID")
sample.list1 = pc.pro[which(pc.pro$vitd == T), c("FID", "IID")]
sample.list2$group = "extended"
sample.list1$group = "used"
grm.sample.list.txt = rbind(sample.list1, sample.list2 )
grm.sample.list.txt$order = 1:nrow(grm.sample.list.txt)
write.table(grm.sample.list.txt[,c(1:2)], file="grm.sample.list.txt", quote = F, sep = "\t", col.names = F, row.names = F)
for (i in 1:40){
sample.list1.part = sample.list1[c(((i-1)*10000+1):(i*10000)),]
grm.sample.list = rbind(sample.list1.part, sample.list2)
write.table(grm.sample.list[,c(1:2)], file=paste0("sample_lists/grm.sample.list", i,".txt" ), quote = F, sep = "\t", col.names = F, row.names = F)
}
i=41
sample.list1.part41 = sample.list1[c(((i-1)*10000+1) : nrow(sample.list1)),]
grm.sample.list41 = rbind(sample.list1.part41, sample.list2)
write.table(grm.sample.list[,c(1:2)], file=paste0("sample_lists/grm.sample.list", i,".txt" ), quote = F, sep = "\t", col.names = F, row.names = F)
tmp_command="gcta64 \
--mbfile v3All_impQC_mbfiles.txt \
--make-grm \
--keep sample_lists/grm.sample.list{TASK_ID}.txt \
--extract ukbEUR_imp_allchr_v3_HM3_maf01_R9.prune.in \
--out Eur_unrelated_and_5402_extended_{TASK_ID}"
prefix=GRM1_41
qsubshcom "$tmp_command" 1 200G $prefix 20:00:00 "-array=1-41"
prefix=singleton
tmp_command="gcta64 --grm Eur_unrelated_and_5402_extended_{TASK_ID} --grm-singleton 0.05 --out Eur_and_3968_extended_{TASK_ID} "
qsubshcom "$tmp_command" 1 200G $prefix 20:00:00 "-array=1-41"
cat *family* > Eur_and_3968_extended_combined_families.txt
families = read.table("Eur_and_3968_extended_combined_families.txt")
sample.list2 = sample.list2[which(!sample.list2$IID%in%families$V2),]
sample.list2 = sample.list2[which(!sample.list2$IID%in%families$V4),]
sample.list2$PC1 = pc.pro[match(sample.list2$IID, pc.pro$IID),"PC1"]
write.table(sample.list2, "QIMR_equivalent_nearly_British_samples_2615.txt", quote = F, sep="\t", row.names = F, col.names = T)
Out of the 3968 unrelated samples, We get 2615 samples that are unrelated to the EUR samples.
Then we selected samples with phenotype available.
## get vitd phenotype
vitd.pheno.all = data.frame(fread("vD_pheno"))
sample.list2$vitD = vitd.pheno.all[ match(sample.list2$IID, vitd.pheno.all$IID),"vitaminD"]
keep.id =read.table("keep_ID.txt")
sample.list2 = sample.list2[which(sample.list2$IID%in%keep.id$V2),]
There are 2372 samples that have vitD phenotype in primary visit.
Then we selected 1632 samples from them to match the sample size in QIMR cohort.
pc.pro[which(pc.pro$PC1 < 0.003 & pc.pro$eur==F), "eur"] = "UKBR"
pc.pro$eur = gsub("TRUE" , "EUR", pc.pro$eur)
pc.pro$eur = gsub("FALSE" , "other ancestries", pc.pro$eur)
pc.pro$eur = factor(pc.pro$eur, levels = c("other ancestries", "UKBR", "EUR"))
ggarrange(
ggplot(data = pc.pro, aes(x = PC1, y = PC2, color = eur)) + geom_point(size = 0.1),
ggplot(data = pc.pro, aes(x = PC2, y = PC3, color = eur)) + geom_point(size = 0.1),
ggplot(data = pc.pro, aes(x = PC3, y = PC4, color = eur)) + geom_point(size = 0.1),
ggplot(data = pc.pro, aes(x = PC1, y = PC4, color = eur)) + geom_point(size = 0.1),
nrow = 2, ncol =2
)
pc.plot1 = ggplot(data = pc.pro, aes(x = PC1, y = PC2, color = eur)) + geom_point(size = 0.1)
ggsave(pc.plot1, file="PC1_PC2_plot_for_UKBR_figureF.png", height = 4, width = 6)
pc.plot1
We extracted the plink files for these samples and merged the chromosomes into one data.
awk '{print $2, $5, $8}' vitD_fastGWA_withX_SBayesR_shrunkLD_nops.snpRes > vitD_fastGWA_SBayesR_predictor.txt
awk '{print $2, $5, $8}' vitD_fastGWA_withX_SBayesS_withshrunkLM_nops.snpRes > vitD_fastGWA_SBayesS_predictor.txt
## extract
tmp_command="plink \
--bfile v3All_impQC/ukbAll_imp_chr{TASK_ID}_v3_impQC \
--extract vitD_fastGWA_SBayesR_SNPs.txt \
--keep nearly_British_samples_2615.txt \
--make-bed \
--out ukbAll_imp_chr{TASK_ID}_v3_impQC_HM3_selected "
prefix=extract
qsubshcom "$tmp_command" 1 100G $prefix 10:00:00 "-array=1-22"
## extract
tmp_command="plink \
--bfile v3All_impQC/ukbAll_imp_chr{TASK_ID}_v3_impQC \
--extract cojo_snps.txt \
--keep QIMR_equivalent_nearly_British_samples_2615.txt \
--make-bed \
--out ukbAll_imp_chr{TASK_ID}_v3_impQC_cojo_selected "
prefix=extract
qsubshcom "$tmp_command" 1 100G $prefix 10:00:00 "-array=1-3"
## merge
plink --bfile ukbAll_v3_impQC_cojo_selected_chr1 --merge-list files.to.merge.for.cojo.tx --make-bed --out ukbAll_v3_impQC_cojo_selected
plink --bfile ukbAll_imp_chr1_v3_impQC_selected --merge-list all.files.to.merge.txt --make-bed --out ukbAll_v3_impQC_selected
Then we profiled their vitD PRS using three predictors, which were generated using Cojo, SBayesR and SBayesS.
cojo = read.table("vitD_fastGWA_maf0.01_withX.cojo", header = T)
write.table(cojo[,c("SNP", "refA", "b")], "vitD_fastGWA_maf0.01_withX_cojo_predictor.txt", quote = F, sep ="\t", row.names = F, col.names = T)
write.table(cojo[,c("SNP")], "cojo_snps.txt", quote = F, sep ="\t", row.names = F, col.names = T)
hm3snp = read.table("vitD_fastGWA_SBayesR_SNPs.txt")
## profile
plink \
--bfile ukbAll_v3_impQC_selected \
--score vitD_fastGWA_SBayesR_predictor.txt \
--out ukbAll_v3_impQC_near_british_SBayesR
plink \
--bfile ukbAll_v3_impQC_selected \
--score vitD_fastGWA_SBayesS_predictor.txt \
--out ukbAll_v3_impQC_near_british_SBayesS
## use cojo snps
cojopred="vitD_fastGWA_maf0.01_withX_cojo_predictor.txt"
plink \
--bfile ukbAll_v3_impQC_cojo_selected \
--score $cojopred \
--out ukbAll_v3_impQC_near_british_Cojo
Here we checked the correlation of PRS and the real phenotype, to both raw data and normalized values.
rint.pheno = read.table("plink_pheno2", header = T)
profile.s = read.table("ukbAll_v3_impQC_near_british_SBayesS.profile", header = T)
profile.r = read.table("ukbAll_v3_impQC_near_british_SBayesR.profile", header = T)
profile.c = read.table("ukbAll_v3_impQC_near_british_Cojo.profile", header = T)
sample.list2$norm.vitd = rint.pheno[match(sample.list2$IID, rint.pheno$IID),"norm_vitD_res"]
sample.list2$profileS = profile.s[match(sample.list2$IID, profile.s$IID),"SCORE"]
sample.list2$profileR = profile.r[match(sample.list2$IID, profile.r$IID),"SCORE"]
sample.list2$profileC = profile.c[match(sample.list2$IID, profile.c$IID),"SCORE"]
corr.plot = ggarrange(
ggplot(data = sample.list2, aes(x = profileS, y = vitD )) + geom_point() + labs( title = paste0("correlation = ", round(cor(sample.list2$vitD, sample.list2$profileS), 2 ))),
ggplot(data = sample.list2, aes(x = profileR, y = vitD )) + geom_point() + labs( title = paste0("correlation = ", round(cor(sample.list2$vitD, sample.list2$profileR), 2))),
ggplot(data = sample.list2, aes(x = profileC, y = vitD )) + geom_point() + labs( title = paste0("correlation = ", round(cor(sample.list2$vitD, sample.list2$profileC), 2))),
nrow = 2, ncol = 2
)
corr.plot
# ggsave(corr.plot, file="Correlation_between_PRS_and_raw_vitD.png", height =8, width = 8)
corr.plot2 = ggarrange(
ggplot(data = sample.list2, aes(x = profileS, y = norm.vitd )) + geom_point() + labs( title = paste0("correlation = ", round(cor(sample.list2$norm.vitd, sample.list2$profileS), 2 ))),
ggplot(data = sample.list2, aes(x = profileR, y = norm.vitd )) + geom_point() + labs( title = paste0("correlation = ", round(cor(sample.list2$norm.vitd, sample.list2$profileR), 2))),
ggplot(data = sample.list2, aes(x = profileC, y = norm.vitd )) + geom_point() + labs( title = paste0("correlation = ", round(cor(sample.list2$norm.vitd, sample.list2$profileC), 2))),
nrow = 2, ncol = 2
)
corr.plot2
# ggsave(corr.plot2, file="Correlation_between_PRS_and_norm_vitD.png", height =8, width = 8)
We did clumping using the fastGWA GWAS result.
## in RCC
snps2exclude=UKB_EUR_impQC_duplicated_rsIDs
## $WD/results/fastGWA
snps2exclude=input/UKB_v3EUR_impQC/UKB_EUR_impQC_duplicated_rsIDs
ref=$medici/v3EUR_impQC/ukbEUR_imp_chr{TASK_ID}_v3_impQC
inFile=vitD_fastGWA_withX.ma
outFile=${inFile}_chr{TASK_ID}_clumped
prefix=clumping
# Run clumping
tmp_command="plink --bfile $ref \
--exclude $snps2exclude \
--clump $inFile \
--clump-p1 1 \
--clump-p2 1 \
--clump-r2 0.01 \
--clump-kb 5000 \
--clump-field "P" \
--out $outFile"
qsubshcom "$tmp_command" 1 80G $prefix.clump 48:00:00 "-array=1-22"
cat vitD_fastGWA_withX.ma_chr*_clumped.clumped | grep "CHR" | head -1 > ${inFile}.clumped
cat vitD_fastGWA_withX.ma_chr*_clumped.clumped | grep -v "CHR" |sed '/^$/d' >> ${inFile}.clumped
awk '{print $3}' ${inFile}.clumped | awk 'NR>1' > ${inFile}.clumped_SNPs.txt
awk '{print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11}' ${inFile}.clumped > ${inFile}.clumped_summary
We wrote the p value and allele effect of clumped SNPs into files.
sumori='vitD_fastGWA_withX.ma'
awk '{print $1, $7 }' ${sumori} > ${sumori}_pvalues
awk '{print $1, $2, $5 }' ${sumori} > ${sumori}_allele_effect
Then we profiled the PRS for our nearEUR samples using clumped SNPs with p.value threshold of 5E-08, 1E-05, 0.001, 0.01, 0.05, 0.1, 0.5 and 1.
pvalue=${sumori}_pvalues
clumped=${inFile}.clumped_SNPs.txt
beta=${sumori}_allele_effect
ukb=ukbAll_v3_impQC_selected
UKBR=keep_ID.txt
## extract
tmp_command="plink \
--bfile v3All_impQC/ukbAll_imp_chr{TASK_ID}_v3_impQC \
--extract ${inFile}.clumped_SNPs.txt \
--keep keep_ID.txt \
--make-bed \
--out PT_plink/ukbAll_imp_chr{TASK_ID}_v3_impQC_PT_selected "
prefix=extract
qsubshcom "$tmp_command" 1 100G $prefix 10:00:00 "-array=1-22"
## merge
plink --bfile ukbAll_imp_chr1_v3_impQC_PT_selected --merge-list all.files.to.merge.txt --make-bed --out ../ukbAll_imp_v3_impQC_PT_selected
## profile PRS
plink \
--bfile ukbAll_imp_v3_impQC_PT_selected \
--extract vitD_fastGWA_withX.ma.clumped_SNPs.txt \
--score vitD_fastGWA_withX.ma_allele_effect \
--q-score-file vitD_fastGWA_withX.ma_pvalues \
--q-score-range q.ranges \
--out UKBR_PT_PRS
First we combined PRS files into one table, and then we plotted them together with sevaral heritability estimates.
## merge profile
pt1 = read.table("UKBR_PT_PRS.S1.profile", header = T)
pt2 = read.table("UKBR_PT_PRS.S2.profile", header = T)
pt3 = read.table("UKBR_PT_PRS.S3.profile", header = T)
pt4 = read.table("UKBR_PT_PRS.S4.profile", header = T)
pt5 = read.table("UKBR_PT_PRS.S5.profile", header = T)
pt6 = read.table("UKBR_PT_PRS.S6.profile", header = T)
pt7 = read.table("UKBR_PT_PRS.S7.profile", header = T)
pt8 = read.table("UKBR_PT_PRS.S8.profile", header = T)
normFunc <- function(x){(x-mean(x, na.rm = T))/sd(x, na.rm = T)}
sample.list2$PT1 = normFunc(pt1[match(sample.list2$IID, pt1$IID),"SCORE"])
sample.list2$PT2 = normFunc(pt2[match(sample.list2$IID, pt2$IID),"SCORE"])
sample.list2$PT3 = normFunc(pt3[match(sample.list2$IID, pt3$IID),"SCORE"])
sample.list2$PT4 = normFunc(pt4[match(sample.list2$IID, pt4$IID),"SCORE"])
sample.list2$PT5 = normFunc(pt5[match(sample.list2$IID, pt5$IID),"SCORE"])
sample.list2$PT6 = normFunc(pt6[match(sample.list2$IID, pt6$IID),"SCORE"])
sample.list2$PT7 = normFunc(pt7[match(sample.list2$IID, pt7$IID),"SCORE"])
sample.list2$PT8 = normFunc(pt8[match(sample.list2$IID, pt8$IID),"SCORE"])
sample.list2$profileC = normFunc(sample.list2$profileC)
sample.list2$profileS = normFunc(sample.list2$profileS)
sample.list2$profileR = normFunc(sample.list2$profileR)
##
R2.table = data.frame(matrix(NA, ncol = 2, nrow = 11))
colnames(R2.table) = c("Threshold", "R2")
R2.table$Threshold = rev(c("SBayesS", "SBayesR", "COJO", "P<1", "P<0.5", "P<0.1", "P<0.05", "P<0.01", "P<0.001", "P<1E-05", "P<5E-08"))
R2.table$R2 = round(c(summary(lm(norm.vitd~PT8, sample.list2))$r.squared,
summary(lm(norm.vitd~PT7, sample.list2))$r.squared,
summary(lm(norm.vitd~PT6, sample.list2))$r.squared,
summary(lm(norm.vitd~PT5, sample.list2))$r.squared,
summary(lm(norm.vitd~PT4, sample.list2))$r.squared,
summary(lm(norm.vitd~PT3, sample.list2))$r.squared,
summary(lm(norm.vitd~PT2, sample.list2))$r.squared,
summary(lm(norm.vitd~PT1, sample.list2))$r.squared,
summary(lm(norm.vitd~profileC, sample.list2))$r.squared,
summary(lm(norm.vitd~profileR, sample.list2))$r.squared,
summary(lm(norm.vitd~profileS, sample.list2))$r.squared), 4)
R2.table$Pval = round(-log10(c( summary(lm(norm.vitd~PT8, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT7, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT6, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT5, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT4, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT3, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT2, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~PT1, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~profileC, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~profileR, sample.list2))$coefficients[2,4],
summary(lm(norm.vitd~profileS, sample.list2))$coefficients[2,4])))
R2.table$beta = round(c(summary(lm(norm.vitd~PT8, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT7, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT6, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT5, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT4, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT3, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT2, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~PT1, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~profileC, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~profileR, sample.list2))$coefficients[2,1],
summary(lm(norm.vitd~profileS, sample.list2))$coefficients[2,1]), digits = 2)
R2.table$s.e. = round(c(summary(lm(norm.vitd~PT8, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT7, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT6, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT5, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT4, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT3, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT2, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~PT1, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~profileC, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~profileR, sample.list2))$coefficients[2,2],
summary(lm(norm.vitd~profileS, sample.list2))$coefficients[2,2]), digits = 4 )
R2.table$cohort = "UKBR"
## add QIMR data
qimr.pre = read.csv("QIMR.prediction.csv")
qimr.pre$cohort = "QIMR"
qimr.pre$Threshold = R2.table$Threshold
qimr.pre$Pval = round(-log10(qimr.pre$Pval))
pred.R2 = rbind(R2.table, qimr.pre)
pred.R2$method = NA
pred.R2[grep("P", pred.R2$Threshold),"method"] = "PT"
pred.R2[grep("P", pred.R2$Threshold, invert = T),"method"] = as.character(pred.R2[(grep("P", pred.R2$Threshold, invert = T)),"Threshold"])
pred.R2$method = factor(pred.R2$method, levels = c("PT", "COJO", "SBayesR", "SBayesS"))
full.pred.R2 = pred.R2
pred.R2 = pred.R2[,c("Threshold", "R2", "Pval", "cohort", "method")]
## add heritability data
her.est = read.csv("heritatiliby_estimates_full_version_oct30.csv")
her.est = her.est[is.na(her.est$category) == F ,]
full.her.est = her.est
her.est = full.her.est[c(1, 9, 25, 15, 17), ]
her.est$annotation = c("Heritability", "GREML", "SBayes R", "summer", "winter")
her.est$cohort = "Heritability"
her.est$Pval = " "
her.est = her.est[,c("annotation", "Heritability", "Pval","var.source", "CatColor", "SE")]
colnames(her.est) = c(colnames(pred.R2),"SE")
her.est$analysis = c("Heritability", rep("SNP-based heritability", 4) )
## combine heritability and prediction data
pred.R2$SE = NA
pred.R2$analysis = "Out-of-sample prediction"
combined.data = rbind(her.est, pred.R2)
combined.data$Threshold = factor(combined.data$Threshold, levels = c("Heritability", "GREML", "SBayes R", "summer", "winter", rev(c("SBayesS", "SBayesR", "COJO", "P<1", "P<0.5", "P<0.1", "P<0.05", "P<0.01", "P<0.001", "P<1E-05", "P<5E-08"))))
combined.data$analysis = factor(combined.data$analysis , levels = c("Heritability", "SNP-based heritability", "Out-of-sample prediction"))
#write.csv(combined.data, file="combined.data.for.figure2.plot.csv")
## plot
colnames(combined.data)[2] = "Variance explained"
combined.data$second.layer = as.numeric(combined.data$method=="PT")
combined.data$second.layer = gsub(1 , "P+T, P-value thresholds applied to GWAS summary statistics" , combined.data$second.layer)
combined.data$second.layer = gsub(0 , " " , combined.data$second.layer)
combined.data$second.layer = factor(combined.data$second.layer, levels = c(" ", "P+T, P-value thresholds applied to GWAS summary statistics" ))
plot.part3 = ggplot(data=combined.data, aes(x=Threshold, y=`Variance explained`, fill =interaction(cohort, method), label = Pval) ) +
geom_bar(stat="identity", width=0.9, position = position_dodge(width=0.9) ) +
facet_grid(~analysis, scales="free", space="free", switch="y", labeller = label_wrap_gen()) +
geom_errorbar(aes(ymin=`Variance explained`-1.96*SE, ymax=`Variance explained`+1.96*SE, colors="black"), width=.2, position=position_dodge(0.9)) +
labs(title="", x ="", y = expression(`Variance explained` %+-% 1.96*SE)) +
ylim(0,0.35) + theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust=0.95,vjust=0.6, face = "bold"),
axis.text.y = element_text(face = "bold"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
plot.title = element_text(hjust = 0.5),
legend.background = element_rect(fill="white", size=0.5, linetype="solid", color=NA),
legend.title = element_blank(),
legend.position = "none") +
geom_text(aes(label=Pval), angle=0, vjust = -0.2, hjust = 0.45 ,size=3.3, position = position_dodge(0.9), parse = T) +
scale_fill_manual(values=c( "indianred2" ,"olivedrab3",
"dark green", "light green",
"hotpink3", "light pink" ,
"goldenrod2",
"blue","light blue",
"plum4","plum2" ))
## add text
ann_text1 = data.frame(lab = "rg = 0.8, s.e. = 0.11",
`Variance explained` = 0.28,
analysis = factor("SNP-based heritability", levels = levels(combined.data$analysis)),
Threshold = "summer",
method=factor("SNP-based heritability estimates", levels = levels(combined.data$method) ),
cohort =factor("SNP-based heritability estimates",levels = levels(combined.data$cohort) ))
ann_text2 = data.frame(lab = "rg = 0.8, s.e. = 0.11",
`Variance explained` = 0.26,
analysis = factor("SNP-based heritability", levels = levels(combined.data$analysis)),
Threshold = "summer",
method=factor("SNP-based heritability estimates", levels = levels(combined.data$method) ),
cohort =factor("SNP-based heritability estimates",levels = levels(combined.data$cohort) ))
ann_text3 = data.frame(lab = "PRS_notes1",
`Variance explained` = 0.35,
analysis = factor("Out-of-sample prediction", levels = levels(combined.data$analysis)),
Threshold = "P<5E-08",
method=factor("PT", levels = levels(combined.data$method) ),
cohort =factor("QIMR",levels = levels(combined.data$cohort) ))
ann_text4 = data.frame(lab = "PRS_notes2",
`Variance explained` = 0.33,
analysis = factor("Out-of-sample prediction", levels = levels(combined.data$analysis)),
Threshold = "P<5E-08",
method=factor("PT", levels = levels(combined.data$method) ),
cohort =factor("QIMR",levels = levels(combined.data$cohort) ))
ann_text5 = data.frame(lab = "PRS_notes3",
`Variance explained` = 0.31,
analysis = factor("Out-of-sample prediction", levels = levels(combined.data$analysis)),
Threshold = "P<5E-08",
method=factor("PT", levels = levels(combined.data$method) ),
cohort =factor("QIMR",levels = levels(combined.data$cohort) ))
ann_text6 = data.frame(lab = "PT_notes",
`Variance explained` = -0.2,
analysis = factor("Out-of-sample prediction", levels = levels(combined.data$analysis)),
Threshold = "P<5E-08",
method=factor("PT", levels = levels(combined.data$method) ),
cohort =factor("QIMR",levels = levels(combined.data$cohort) ))
colnames(ann_text1)[2] = "Variance explained"
colnames(ann_text2)[2] = "Variance explained"
colnames(ann_text3)[2] = "Variance explained"
colnames(ann_text4)[2] = "Variance explained"
colnames(ann_text5)[2] = "Variance explained"
colnames(ann_text6)[2] = "Variance explained"
anno = data.frame(x1 = 3,
x2 = 4,
y1 = 0.24,
y2 = 0.25,
analysis = factor("SNP-based heritability", levels = levels(combined.data$analysis)),
Threshold = "summer",
method=factor("SNP-based heritability estimates", levels = levels(combined.data$method) ),
cohort =factor("SNP-based heritability estimates",levels = levels(combined.data$cohort) ),
Pval = " ")
anno.x = textGrob("P+T, P-value thresholds applied to GWAS summary statistics", gp = gpar(fontsize = 5))
annoted.plot.part3 = plot.part3 +
geom_text(data = ann_text1, parse = TRUE, label = "r[g] == 0.80", hjust = -0.1, vjust = 0, size = 5) +
geom_text(data = ann_text2, parse = TRUE, label = "s.e. == 0.11", hjust = 0.12, vjust = 0, size = 5) +
geom_text(data = ann_text3, parse = F, label = "Polygenic risk prediction using different SNP sets and SNP effect size estimates.", hjust = 0, vjust = 0, size = 3.5) +
geom_text(data = ann_text4, parse = F, label = "Dark coloured bars in each pair are for the UKB replication sample (N=1,632)", hjust = 0, vjust = 0, size = 3.5) +
geom_text(data = ann_text5, parse = F, label = "and light coloured bars are for the QIMR Australian replication sample (N=1,632).", hjust = 0, vjust = 0, size = 3.5) +
geom_text(data = ann_text6, parse = F, label = "P+T, P-value thresholds applied to GWAS summary statistics", hjust = 0, vjust = 0, size = 3.5) +
geom_segment(data = anno, aes(x = x1, xend = x1,
y = y1, yend = y2),
colour = "gray38") +
geom_segment(data = anno, aes(x = x2, xend = x2,
y = y1, yend = y2),
colour = "gray38") +
geom_segment(data = anno, aes(x = x1, xend = x2,
y = y2, yend = y2),
colour = "gray38") + coord_cartesian(clip = "off")
# annoted.plot.part3 + annotation_custom(anno.x,xmin=6,xmax=13,ymin=-0.1,ymax=-0.1) + coord_cartesian(clip = "off")
# annoted.plot.part3 + annotate(geom="text", x = 6, y = -0.1, label = "P+T, P-value thresholds applied to GWAS summary statistics", size =5)
# ggsave(annoted.plot.part3 , file="ggsaved.figure2.png", width = 10, height = 6)
# ggsave(annoted.plot.part3 , file="ggsaved.figure2_0228.pdf", width = 10, height = 6)