# Libraries
library(ggplot2)
library(dplyr)
library(ggrepel)
library(DT)
library(qqman)
library(knitr)
library(kableExtra)
We ran a classical genome-wide association (GWA) analysis with fastGWA. The
aim was to test the association between alleles and the mean of a
phenotype and identify quantitative trait loci (QLT), i.e. variants that
are associated with variability in the trait.
Below we describe our analyses and present the results.
This analysis was performed with fastGWA using variants with MAF>0.01. Given our sample size (N=417580), this is equivalent to restricting to SNPs seen at least 8351.6 times.
Chromosome X and its pseudo-autosomal region were analysed separately in males and females and were subsequently meta-analysed with METAL, using the inverse-variance based method. Only variants with MAF>0.01 in females were analysed.
The following files were used to run the analysis:
covariates
section of the
Pheno
page.To run the analysis faster, associations were performed by chromosome and ran in parallel. qsubshcom was used to submit the parallel jobs (array).
# Directories
genoDir=$medici/v3EUR_impQC
# #########################################
# Prepare files for analysis
# #########################################
# ------ Get sparse GRM
## Run from delta
cp $longda/UKB_geno/UKB_sparse_GRM/UKB_HM3_m01_sp.grm* $WD/input
# ------ Get list of SNPs to include in the analysis
# Extract common (MAF>0.01) variants
geno=$medici/v3EUR_impQC/ukbEUR_imp_chr{TASK_ID}_v3_impQC
filters=maf0.01
genoDir=$WD/input/UKB_v3EUR_impQC
snpList=$genoDir/UKB_EUR_impQC_$filters
cd $genoDir
tmp_command="plink --bfile $geno \
--write-snplist \
--maf 0.01 \
--out ${snpList}_chr{TASK_ID}"
qsubshcom "$tmp_command" 1 10G commonVariants 3:00:00 "-array=1-22"
cat ${snpList}_chr*.snplist > $snpList.snplist
rm ${snpList}_chr*
# ------ Make mbfile with all individual-level data files to include in the analysis
tmpDir=$WD/results/fastGWA/tmpDir
echo $genoDir/ukbEUR_imp_chr1_v3_impQC > $tmpDir/UKB_v3EUR_impQC.files
for i in `seq 2 22`; do echo $genoDir/ukbEUR_imp_chr${i}_v3_impQC >> $tmpDir/UKB_v3EUR_impQC.files; done
# #########################################
# Run GWAS with fastGWA - autosomes
# #########################################
# Set directories
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
# Files/settings to use
prefix=vitD_fastGWA
#prefix=vitD_BMIcov_fastGWA
geno=$tmpDir/UKB_v3EUR_impQC.files
pheno=$WD/input/mlm_pheno
grm=$WD/input/UKB_HM3_m01_sp
quantCovars=$WD/input/quantitativeCovariates
#quantCovars=$WD/input/quantitativeCovariates_bmi
catgCovars=$WD/input/categoricalCovariates
snps2include=$WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_maf0.01.snplist
output=$tmpDir/$prefix
# Run GWAS
cd $results
gcta=$bin/gcta/gcta_1.92.3beta3
tmp_command="$gcta --mbfile $geno \
--fastGWA-lmm \
--pheno $pheno \
--grm-sparse $grm \
--qcovar $quantCovars\
--covar $catgCovars \
--covar-maxlevel 110 \
--extract $snps2include \
--threads 16 \
--out $output"
qsubshcom "$tmp_command" 16 10G $prefix 24:00:00 ""
# Compress results
gzip -c $output.fastGWA > $results/$prefix.gz
# Convert GWAS sumstats to .ma format (for COJO)
echo "SNP A1 A2 freq b se p n" > $results/maFormat/$prefix.ma
awk '{print $2,$4,$5,$7,$8,$9,$10,$6}' $tmpDir/$prefix.fastGWA | sed 1d >> $results/maFormat/$prefix.ma
# #########################################
# Run GWAS with fastGWA - X chromosome
# #########################################
# Set directories
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
# Files/settings to use
pheno=$WD/input/mlm_pheno
grm=$WD/input/UKB_HM3_m01_sp
quantCovars=$WD/input/quantitativeCovariates
#quantCovars=$WD/input/quantitativeCovariates_bmi
catgCovars=$WD/input/categoricalCovariates_noSex
# ---- Run GWAS separately for males and females
cd $results
for sex in FEMALE MALE
do
# Run X chromosome pseudo-autosomal region (chr XY) as well
for region in X XY
do
prefix=vitD_fastGWA_chr$region.$sex
#prefix=vitD_BMIcov_fastGWA_chr$region.$sex
geno=$medici/v3EUR_chrX/ukbEURv3_${region}_maf0.1_$sex
output=$tmpDir/$prefix
gcta=$bin/gcta/gcta_1.92.3beta3
tmp_command="$gcta --bfile $geno \
--fastGWA-lmm \
--pheno $pheno \
--grm-sparse $grm \
--qcovar $quantCovars\
--covar $catgCovars \
--covar-maxlevel 110 \
--threads 16 \
--out $output"
qsubshcom "$tmp_command" 16 30G $prefix 6:00:00 ""
done
done
# #########################################
# Meta-analyse results from males and females for X and XY
# #########################################
cd $WD/results/fastGWA/tmpDir
metal=$bin/metal/metal
$metal
# === CHROMOSOME X ================================================
CLEAR
SCHEME STDERR
AVERAGEFREQ ON
# Describe and process results for X chromosome
MARKER SNP
ALLELE A1 A2
EFFECT BETA
PVALUE P
WEIGHT N
STDERR SE
FREQLABEL AF1
PROCESS $WD/results/fastGWA/tmpDir/vitD_fastGWA_chrX.FEMALE.fastGWA
PROCESS $WD/results/fastGWA/tmpDir/vitD_fastGWA_chrX.MALE.fastGWA
#PROCESS $WD/results/fastGWA/tmpDir/vitD_BMIcov_fastGWA_chrX.FEMALE.fastGWA
#PROCESS $WD/results/fastGWA/tmpDir/vitD_BMIcov_fastGWA_chrX.MALE.fastGWA
# Run meta-analysis
OUTFILE $WD/results/fastGWA/tmpDir/vitD_fastGWA_chrX .TBL
#OUTFILE $WD/results/fastGWA/tmpDir/vitD_BMIcov_fastGWA_chrX .TBL
ANALYZE
# === CHROMOSOME XY (X chromosome pseudo-autosomal region) ========
CLEAR
SCHEME STDERR
AVERAGEFREQ ON
# Describe and process results for XY chromosome
MARKER SNP
ALLELE A1 A2
EFFECT BETA
PVALUE P
WEIGHT N
STDERR SE
FREQLABEL AF1
PROCESS $WD/results/fastGWA/tmpDir/vitD_fastGWA_chrXY.FEMALE.fastGWA
PROCESS $WD/results/fastGWA/tmpDir/vitD_fastGWA_chrXY.MALE.fastGWA
#PROCESS $WD/results/fastGWA/tmpDir/vitD_BMIcov_fastGWA_chrXY.FEMALE.fastGWA
#PROCESS $WD/results/fastGWA/tmpDir/vitD_BMIcov_fastGWA_chrXY.MALE.fastGWA
# Run meta-analysis
OUTFILE $WD/results/fastGWA/tmpDir/vitD_fastGWA_chrXY .TBL
#OUTFILE $WD/results/fastGWA/tmpDir/vitD_BMIcov_fastGWA_chrXY .TBL
ANALYZE
# #########################################
# Format meta-analysed results and merge with autossomes
# Generate .ma format files for COJO
# #########################################
setwd("results/fastGWA")
prefix="vitD_fastGWA"
#prefix="vitD_BMIcov_fastGWA"
# Read X meta-analyses results + GWAS results for autosomes
xmeta=read.table(paste0("tmpDir/",prefix,"_chrX1.TBL"),h=T,stringsAsFactors=F)
xymeta=read.table(paste0("tmpDir/",prefix,"_chrXY1.TBL"),h=T,stringsAsFactors=F)
autosomes=read.table(paste0("tmpDir/",prefix,".fastGWA"), h=T,stringsAsFactors=F)
names(xmeta)=c("SNP","A1","A2","AF1","AF1_SE","BETA","SE","P","direction")
names(xymeta)=c("SNP","A1","A2","AF1","AF1_SE","BETA","SE","P","direction")
# Get chrX details (BP, allele freq, and N) from female and male GWAS
xfemale=read.table(paste0("tmpDir/",prefix,"_chrX.FEMALE.fastGWA"), h=T,stringsAsFactors=F)
xmale=read.table(paste0("tmpDir/",prefix,"_chrX.MALE.fastGWA"), h=T,stringsAsFactors=F)
## Get all results in the same order
xmeta=xmeta[match(xfemale$SNP,xmeta$SNP),]
xmale=xmale[match(xfemale$SNP,xmale$SNP),]
## Add BP
xmeta$POS=xfemale$POS
# Add N
xmeta$N=xfemale$N+xmale$N
# Change alleles to capital letters
xmeta$A1=toupper(xmeta$A1)
xmeta$A2=toupper(xmeta$A2)
# Define chromosome
xmeta$CHR=23
# Get chrXY details (BP, allele freq, and N) from female and male GWAS
xyfemale=read.table("tmpDir/vitD_fastGWA_chrXY.FEMALE.fastGWA", h=T,stringsAsFactors=F)
xymale=read.table("tmpDir/vitD_fastGWA_chrXY.MALE.fastGWA", h=T,stringsAsFactors=F)
## Get all results in the same order
xymeta=xymeta[match(xyfemale$SNP,xymeta$SNP),]
xymale=xymale[match(xyfemale$SNP,xymale$SNP),]
## Add BP
xymeta$POS=xyfemale$POS
## Add N
xymeta$N=xyfemale$N+xymale$N
# Define chromosome
xymeta$CHR=25
# Change alleles to capital letters
xymeta$A1=toupper(xymeta$A1)
xymeta$A2=toupper(xymeta$A2)
# fastGWA format
# CHR SNP POS A1 A2 N AF1 BETA SE P
xmeta_fastgwa=xmeta[,c("CHR","SNP","POS","A1","A2","N","AF1","BETA","SE","P")]
xymeta_fastgwa=xymeta[,c("CHR","SNP","POS","A1","A2","N","AF1","BETA","SE","P")]
## Merge X + XY + autosomes
allChr=rbind(autosomes, xmeta_fastgwa, xymeta_fastgwa)
# COJO (.ma) format
# SNP A1 A2 freq b se p n
## CHR X
xmeta_ma=xmeta[,c("SNP","A1","A2","AF1","BETA","SE","P","N")]
names(xmeta_ma)=c("SNP","A1","A2","freq","b","se","p","n")
## CHR XY
xymeta_ma=xymeta[,c("SNP","A1","A2","AF1","BETA","SE","P","N")]
names(xymeta_ma)=c("SNP","A1","A2","freq","b","se","p","n")
# Save new file formats
## All chromosomes
write.table(allChr, file=gzfile(paste0(prefix,"_withX.gz")), quote=F, row.names=F)
## X and XY in fastGWA format (for clumping)
write.table(xmeta_fastgwa, file=gzfile(paste0("tmpDir/",prefix,"_chrX.gz")), quote=F, row.names=F)
write.table(xymeta_fastgwa, file=gzfile(paste0("tmpDir/",prefix,"_chrXY.gz")), quote=F, row.names=F)
## X and XY in .ma format (for COJO)
write.table(xmeta_ma, paste0("maFormat/",prefix,"_chrX.ma"), quote=F, row.names=F)
write.table(xymeta_ma, paste0("maFormat/",prefix,"_chrXY.ma"), quote=F, row.names=F)
# #########################################
# Make .ma file with all chr (including X)
# #########################################
prefix=vitD_fastGWA
#prefix=vitD_BMIcov_fastGWA
results=$WD/results/fastGWA/maFormat
cp $results/$prefix.ma $results/${prefix}_withX.ma
sed 1d $results/${prefix}_chrX.ma >> $results/${prefix}_withX.ma
sed 1d $results/${prefix}_chrXY.ma >> $results/${prefix}_withX.ma
Results were then clumped with plink 1.9. The following options were used in this analysis:
SP2
# There are duplicated SNPs in the v3EUR_impQC files. This causes an error when running clumping using them as LD reference.
# Generate a file with their rs IDs and exclude them from analysis when clumping
for i in `seq 1 22`
do
echo $i
awk '{print $2}' $medici/v3EUR_impQC/ukbEUR_imp_chr${i}_v3_impQC.bim | sort | uniq -d >> $WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_duplicated_rsIDs
done
# #########################################
# Clump GWAS results - autossomes
# #########################################
# Directories
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
# Files/settings to use
prefix=vitD_fastGWA
#prefix=vitD_BMIcov_fastGWA
ref=$medici/v3EUR_impQC/ukbEUR_imp_chr{TASK_ID}_v3_impQC
inFile=$tmpDir/$prefix.fastGWA
outFile=$tmpDir/${prefix}_chr{TASK_ID}
snps2exclude=$WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_duplicated_rsIDs
window=10000
jobName=$prefix.clump
# Run clumping
cd $results
plink=$bin/plink/plink-1.9/plink
tmp_command="$plink --bfile $ref \
--exclude $snps2exclude \
--clump $inFile \
--clump-p1 5e-8 \
--clump-r2 0.01 \
--clump-kb $window \
--clump-field "P" \
--out $outFile"
qsubshcom "$tmp_command" 1 30G $jobName 24:00:00 "-array=1-22"
# #########################################
# Clump GWAS results - chromosome X
# #########################################
# Directories
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
# Run clumping
cd $results
for chr in X XY
do
# Files/settings to use
prefix=vitD_fastGWA_chr$chr
#prefix=vitD_BMIcov_fastGWA_chr$chr
ref=$medici/v3EUR_chrX/ukbEURv3_${chr}_maf0.1_FEMALE
inFile=$tmpDir/$prefix.gz
outFile=$tmpDir/$prefix
plink=$bin/plink/plink-1.9/plink
tmp_command="$plink --bfile $ref \
--clump $inFile \
--clump-p1 5e-8 \
--clump-r2 0.01 \
--clump-kb 10000 \
--clump-field "P" \
--out $outFile"
qsubshcom "$tmp_command" 1 10G $prefix.clump 10:00:00 ""
echo $prefix.clump
done
# #########################################
# Merge clumped results once all chr finished running
# #########################################
cat $tmpDir/${prefix}_chr*clumped | grep "CHR" | head -1 > $results/${prefix}_withX.clumped
cat $tmpDir/${prefix}_chr*clumped | grep -v "CHR" |sed '/^$/d' >> $results/${prefix}_withX.clumped
COJO – a summary-statistics-based method implemented in GCTA – is an approach to identify independent associations. It starts with the most associated SNP, then, conditioning on that SNP, it finds the second most associated SNP. Then, conditioning on both those SNPs, it finds the third most associated SNP, and so on.
The following options were used in this analysis:
# ########################
# Prepare files to run COJO
# ########################
# --- Extract a random set of 20K unrelated Europeans to use as LD reference for the autosomes
# In addition, keep only the variants in the vitamin D GWAS to speed up GSMR analysis
# Extract SNPs in fastGWA results
prefix=vitD_fastGWA
results=$WD/results/fastGWA
random20k_dir=$WD/input/UKB_v3EURu_impQC/random20K
awk 'NR>1{print $2}' $tmpDir/$prefix.fastGWA | > $random20k_dir/$prefix.snpList
# Extract data
ld_ref=$medici/v3EURu_impQC/ukbEURu_imp_chr{TASK_ID}_v3_impQC
ids2keep=$medici/v3Samples/ukbEURu_v3_all_20K.fam
snps2keep=$WD/input/UKB_v3EURu_impQC/random20K/$prefix.snpList
output=$random20k_dir/ukbEURu_imp_chr{TASK_ID}_v3_impQC_random20k_maf0.01
cd $random20k_dir
plink=$bin/plink/plink-1.9/plink
tmp_command="$plink --bfile $ld_ref \
--keep $ids2keep \
--extract $snps2keep \
--make-bed \
--out $output"
qsubshcom "$tmp_command" 1 10G ukbEURu_v3_impQC_random20k_maf0.01 24:00:00 "-array=1-22"
# #########################################
# Run COJO on GWAS results - autosomes
# #########################################
gcta=$bin/gcta/gcta_1.92.3beta3
# Common settings
prefix=vitD_fastGWA
#prefix=vitD_BMIcov_fastGWA
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
random20k_dir=$WD/input/UKB_v3EURu_impQC/random20K
ldRef=$random20k_dir/ukbEURu_imp_chr{TASK_ID}_v3_impQC_random20k_maf0.01
filters=maf0.01
window=10000
inFile=$results/maFormat/$prefix.ma
snps2include=$WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_$filters.snplist
outFile=$tmpDir/${prefix}_${filters}_chr{TASK_ID}
jobName=$prefix.COJO
# Run COJO
cd $results
tmp_command="$gcta --bfile $ldRef \
--cojo-slct \
--cojo-file $inFile \
--cojo-wind $window \
--extract $snps2include \
--out $outFile"
qsubshcom "$tmp_command" 1 100G $jobName 24:00:00 "-array=1-22 $account"
# #########################################
# Run COJO on GWAS results - X chromosome
# #########################################
# Common settings
prefix=vitD_fastGWA
#prefix=vitD_BMIcov_fastGWA
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
filters=maf0.01
# Run COJO
cd $results
gcta=$bin/gcta/gcta_1.92.3beta3
for chr in X XY
do
ref=$medici/v3EUR_chrX/ukbEURv3_${chr}_maf0.1_FEMALE
inFile=$results/maFormat/${prefix}_chr$chr.ma
outFile=$tmpDir/${prefix}_${filters}_chr$chr
tmp_command="$gcta --bfile $ref \
--cojo-slct \
--cojo-file $inFile \
--maf 0.01 \
--out $outFile"
qsubshcom "$tmp_command" 1 30G ${prefix}_chr$chr.COJO 24:00:00 "$account"
done
# #########################################
# Merge COJO results once all chr finished running
# #########################################
cat $tmpDir/${prefix}_${filters}*.jma.cojo | grep "Chr" | uniq > $results/${prefix}_${filters}_withX.cojo
cat $tmpDir/${prefix}_${filters}_chr*.jma.cojo | grep -v "Chr" |sed '/^$/d' >> $results/${prefix}_${filters}_withX.cojo
To assess the number of significant associations that were independent from previously reported variants, we performed an association analysis conditional on the 6 following SNPs:
Specifically, we used the --cojo-cond
option implemented
in GCTA for this analysis.
# ------ Condition GWAS results on known variants
# Delta settings
gcta=$bin/gcta/gcta_1.92.3beta3
# Common settings
prefix=vitD_fastGWA
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
random20k_dir=$WD/input/UKB_v3EURu_impQC/random20K
ldRef=$random20k_dir/ukbEURu_imp_chr{TASK_ID}_v3_impQC_random20k_maf0.01
filters=maf0.01
inFile=$results/maFormat/$prefix.ma
snps2include=$WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_$filters.snplist
outFile=$tmpDir/${prefix}_${filters}_conditionOnKnownLoci_chr{TASK_ID}
knownLoci=$WD/input/vitD_knownLoci.txt
# Generate list of known variants
loci2conditionOn=$tmpDir/knownLoci
awk 'NR>1 {print $1}' $knownLoci > $loci2conditionOn
# Condition GWAS results on known variants
cd $results
tmp_command="$gcta --bfile $ldRef \
--cojo-file $inFile \
--extract $snps2include \
--cojo-cond $loci2conditionOn \
--out $outFile"
qsubshcom "$tmp_command" 1 100G $prefix.COJO.knownLoci 24:00:00 "-array=1-22 $account"
# ------ Create file with all results after conditioning on known variants
## Run this step in R
knownLoci=read.table("input/vitD_knownLoci.txt", h=T, stringsAsFactors=F, sep="\t")
# Load all results and
gwas=read.table("results/fastGWA/vitD_fastGWA_withX.gz", h=T, stringsAsFactors=F)
# Keep SNP information from CHR that have been condition on known SNPs and remove results (we'll use the conditioned ones)
knownLoci_SNPinfo=gwas[gwas$CHR %in% knownLoci$CHR,c("SNP","A1","A2")]
gwas=gwas[!gwas$CHR %in% knownLoci$CHR,]
# Add conditioned results
for(chr in unique(knownLoci$CHR))
{
print(chr)
tmp=read.table(paste0("results/fastGWA/tmpDir/vitD_fastGWA_maf0.01_conditionOnKnownLoci_chr",chr,".cma.cojo"), h=T, stringsAsFactors=F)
# Add information not provided in COJO output (A2)
tmp=merge(tmp,knownLoci_SNPinfo, all.x=T)
if(!identical(tmp$A1,tmp$refA)){print("Check alleles")}
# Merge with unconditioned results
tmp=tmp[,c("Chr","SNP","bp","A1","A2","n","freq","bC","bC_se","pC")]
names(tmp)=c("CHR","SNP","POS","A1","A2","N","AF1","BETA","SE","P")
gwas=rbind(gwas,tmp)
}
# Save conditioned results
write.table(gwas, file=gzfile("results/fastGWA/vitD_fastGWA_withX_conditionOnKnownLoci.gz"), quote=F, row.names=F)
# Convert to .ma format (to run COJO) and save
## SNP A1 A2 freq b se p n
maFormat=gwas[,c("SNP","A1","A2","AF1","BETA","SE","P","N")]
names(maFormat)=c("SNP","A1","A2","freq","b","se","p","n")
write.table(maFormat, "results/fastGWA/maFormat/vitD_fastGWA_withX_conditionOnKnownLoci.ma", quote=F, row.names=F)
# ------ Run COJO on conditioned results
# Delta settings
gcta=$bin/gcta/gcta_1.92.3beta3
# Common settings
prefix=vitD_fastGWA_withX_conditionOnKnownLoci
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
random20k_dir=$WD/input/UKB_v3EURu_impQC/random20K
ldRef=$random20k_dir/ukbEURu_imp_chr{TASK_ID}_v3_impQC_random20k_maf0.01
filters=maf0.01
inFile=$results/maFormat/$prefix.ma
snps2include=$WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_$filters.snplist
outFile=$tmpDir/${prefix}_${filters}_chr{TASK_ID}
knownLoci=$WD/input/vitD_knownLoci.txt
cd $results
tmp_command="$gcta --bfile $ldRef \
--cojo-slct \
--cojo-file $inFile \
--extract $snps2include \
--out $outFile"
qsubshcom "$tmp_command" 1 30G $prefix.COJO 24:00:00 "-array=1-22 $account"
cat $tmpDir/${prefix}_${filters}*.jma.cojo | grep "Chr" | uniq > $results/${prefix}_${filters}.cojo
cat $tmpDir/${prefix}_${filters}_chr*.jma.cojo | grep -v "Chr" |sed '/^$/d' >> $results/${prefix}_${filters}.cojo
To run LDSC, we restricted the GWAS summary statistics to variants found in w_hm3.noMHC.snplist (downloaded from LDHub). This is a list of HapMap3 SNPs and excludes SNPs in the MHC region. Note that this does not include variants on the X chromosome.
module load ldsc
prefix=vitD_fastGWA
#prefix=vitD_BMIcov_fastGWA
# Directories
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
ldscDir=$bin/ldsc
# Format GWAS results
awk '{print $1,$2,$3,$5,$8,$7}' $results/maFormat/$prefix.ma > $tmpDir/$prefix.ldHubFormat
# Restrict to common SNPs
sed 1q $tmpDir/$prefix.ldHubFormat > $results/ldscFormat/$prefix.ldHubFormat
awk 'NR==FNR{a[$1];next} ($1 in a)' $WD/input/UKB_v3EURu_impQC/UKB_EURu_impQC_maf0.01.snplist <(awk 'NR==FNR{a[$1];next} ($1 in a)' $WD/input/w_hm3.noMHC.snplist $tmpDir/$prefix.ldHubFormat) >> $results/ldscFormat/$prefix.ldHubFormat
# Munge sumstats (process sumstats and restrict to HapMap3 SNPs)
cd $results/ldscFormat/
munge_sumstats.py --sumstats $results/ldscFormat/$prefix.ldHubFormat \
--merge-alleles $ldscDir/w_hm3.snplist \
--out $results/ldscFormat/$prefix
# Run LDSC
ldsc.py --h2 $results/ldscFormat/$prefix.sumstats.gz \
--ref-ld-chr $ldscDir/eur_w_ld_chr/ \
--w-ld-chr $ldscDir/eur_w_ld_chr/ \
--out $results/ldsc/$prefix.ldsc
Note: Unless otherwise specified, all results presented are for variants with MAF>0.01.
# Read in fastGWA results ---------------------------
#gwas=read.table("results/fastGWA/vitD_fastGWA_withX.gz", h=T, stringsAsFactors=F)
#saveRDS(gwas, file="results/fastGWA/vitD_fastGWA_withX.RDS")
gwas=readRDS("results/fastGWA/vitD_fastGWA_withX.RDS")
names(gwas)[grep("POS",names(gwas))]=c("BP")
cojo=read.table("results/fastGWA/vitD_fastGWA_maf0.01_withX.cojo", h=T, stringsAsFactors=F)
clump=read.table("results/fastGWA/vitD_fastGWA_withX.clumped", h=T, stringsAsFactors=F)
# Add cumulative position + cojo information for plotting
# ----- Add chromosome start site to the GWAS result
don=aggregate(BP~CHR,gwas,max)
don$tot=cumsum(as.numeric(don$BP))-don$BP
don=merge(gwas,don[,c("CHR","tot")])
# ----- Add a cumulative position to plot in this order
don=don[order(don$CHR,don$BP),]
# ----- Restrict SNPs with P<0.03 + a random samples of 100K SNPs (to plot faster)
don=don[don$P<0.03 | don$SNP %in% sample(gwas$SNP,100000) | don$SNP %in% cojo$SNP,]
don$BPcum=don$BP+don$tot
# ----- Add cojo information
don$is_cojo_sig=ifelse(don$SNP %in% cojo$SNP & don$P<1e-15, "yes","no")
don$is_cojo=ifelse(don$SNP %in% cojo$SNP, "yes","no")
# Prepare X axis
axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
# Restrict y-axis
don=don[(-log10(don$P)<150 & don$CHR<25) | don$is_cojo=="yes",]
## Keep top COJO SNPs
don$excluded=ifelse(-log10(don$P)>150, "yes","no")
don[-log10(don$P)>150,"P"]=min(don[-log10(don$P)<150,"P"])
axisdf = don %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
# Plot ---------------------------
#tiff("manuscript/submission2/Figures/figure3_manhattanPlot.tiff", width=8, height=3 ,units="in", res=600,compression="lzw", bg="transparent")
ggplot(don, aes(x=BPcum, y=-log10(P))) +
geom_point( aes(color=as.factor(CHR)), alpha=0.3, size=0.5) +
scale_color_manual(values = rep(c("lightsteelblue4", "lightsteelblue3"),44)) +
# # Highlighted SNPs of interest
geom_point(data=don[don$is_cojo=="yes" & don$excluded=="no",], color="red", size=2) +
geom_point(data=don[don$is_cojo=="yes" & don$excluded=="yes",], color="red", size=2, shape=17) +
# Edit axis
scale_x_continuous( label=axisdf$CHR, breaks=axisdf$center ) +
#scale_y_continuous(limits = c(0, 160)) +
labs(x="",
y=expression('-log'[10]*'(P)'))+
theme_bw() +
theme(legend.position="none",
axis.text=element_text(size=7),
#axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
#dev.off()
# Keep info to summarise in the report below
fastGWA_common=gwas
fastGWA_cojo_common=cojo
fastGWA_clump_common=clump
Red dots represent independent associations
identified with COJO.
Briefly, we have:
knownLoci=read.table("input/vitD_knownLoci.txt",h=T,stringsAsFactors=F, sep="\t")
knownLoci=merge(knownLoci, fastGWA_common[,c("SNP","BP")], all.x=T)
# ---------- Check number of new loci in our GWAS - based on COJO results
indep=read.table("results/fastGWA/vitD_fastGWA_withX_conditionOnKnownLoci.gz", h=T, stringsAsFactors=F)
indep_cojo=read.table("results/fastGWA/vitD_fastGWA_withX_conditionOnKnownLoci_maf0.01.cojo", h=T, stringsAsFactors=F)
# ---------- Present table of previously known variants
knownLoci=knownLoci[order(knownLoci$CHR, knownLoci$BP),c("SNP","CHR","BP","ref")]
tmp2=kable(knownLoci,
caption=paste0("List of previously reported GWS loci (N=",nrow(knownLoci),")"),
row.names=F)
kable_styling(tmp2, full_width=F)
SNP | CHR | BP | ref |
---|---|---|---|
rs2282679 | 4 | 72608383 | Ahn et al, Wang et al, Jiang et al |
rs10741657 | 11 | 14914878 | Ahn et al, Wang et al, Jiang et al |
rs12785878 | 11 | 71167449 | Ahn et al, Wang et al, Jiang et al |
rs10745742 | 12 | 96358529 | Jiang et al |
rs8018720 | 14 | 39556185 | Jiang et al |
rs6013897 | 20 | 52742479 | Wang et al, Jiang et al |
To assess the number of new associations in our study, we conditioned our results on the list of variants above.
After conditioning on those, we had:
# ################################################
# Present list of independent associations (identified with COJO)
# ################################################
indep = fastGWA_cojo_common %>% arrange(Chr, bp)
# Format table
indep=indep[,c("Chr","SNP","bp","refA","freq","b","se","p","n","freq_geno","bJ","bJ_se","pJ")]
indep$p=format(indep$p, digits=3)
indep$pJ=format(indep$pJ, digits=3)
indep=indep %>% mutate_at(vars(se, bJ_se, b, bJ), round, 4)
indep=indep %>% mutate_at(vars(freq, freq_geno), round, 2)
# Present table
datatable(indep,
rownames=FALSE,
extensions=c('Buttons','FixedColumns'),
options=list(pageLength=5,
dom='frtipB',
buttons=c('csv', 'excel'),
scrollX=TRUE),
caption="List of independent associations") %>%
formatStyle("p","white-space"="nowrap") %>%
formatStyle("pJ","white-space"="nowrap")
Columns are: Chr, chromosome; SNP,
SNP rs ID; bp, physical position;
refA, the effect allele; type, type of
QTL; freq, frequency of the effect allele in the
original data; b, se and
p, effect size, standard error and p-value from the
original GWAS; n, estimated effective sample size;
freq_geno, frequency of the effect allele in the
reference sample; bJ, bJ_se and
pJ, effect size, standard error and p-value from a
joint analysis of all the selected SNPs.
To run LDSC, we restricted the GWAS summary statistics to HapMap3 SNPs, excluding the MHC region and X chromosome. For more information see the Methods section above.
# Get sample for QQ plotting
gwas=fastGWA_common
pthrh=0.03
tmp=c(gwas[gwas$P<pthrh,"P"], sample(gwas[gwas$P>pthrh,"P"],100000))
# Plot
qq(tmp)
LDSC
tmp=read.table("results/fastGWA/ldsc/vitD_fastGWA.ldsc.log", skip=15, fill=T, h=F, colClasses="character", sep="_")[,1]
# Get results:
h2=strsplit(grep("Total Observed scale h2",tmp, value=T),":")[[1]][2]
lambda_gc=strsplit(grep("Lambda GC",tmp, value=T),":")[[1]][2]
chi2=strsplit(grep("Mean Chi",tmp, value=T),":")[[1]][2]
intercept=strsplit(grep("Intercept",tmp, value=T),":")[[1]][2]
ratio=strsplit(grep("Ratio",tmp, value=T),":")[[1]][2]
# #########################################
# Run SBayesR
# #########################################
# RCC settings
mldm=$llloyd/sbayesr/pre_review/ukbEURu_hm3_sparse/ukbEURu_hm3_sparse_mldm_list.txt
# Common settings
gctb=$bin/gctb/gctb_2.0_Linux/gctb
prefix=vitD_fastGWA
results=$WD/results/fastGWA
gwas=$results/maFormat/$prefix.ma
output=$results/$prefix.sbayesr
# Run SBayesR
cd $results
tmp_command="$gctb --sbayes R \
--gwas-summary $gwas \
--mldm $mldm \
--out $output"
qsubshcom "$tmp_command" 1 70G $prefix.sbayesr 60:00:00 "$account"
# Copy results to local computer (run command in local computer)
scp $rooID:$WD/results/fastGWA/vitD_fastGWA.sbayesr.parRes $local/results/fastGWA/
tmp=read.table("results/fastGWA/vitD_fastGWA.sbayesr.parRes",h=T,stringsAsFactors=F, skip=2)
h2=round(as.numeric(tmp["hsq",][1]),4)
se=round(as.numeric(tmp["hsq",][2]),4)