In this section, we ran summary-data–based Mendelian randomization (SMR) on the UKB vitamin D GWAS results (with no correction for BMI). The aim was to prioritize putative causal genes with expression underlying vitamin D levels. The associations identified with SMR (between gene expression and the trait of interest - in this case vitamin D) can result from one of three scenarios:
Zhu et al. 2016 proposed a method (the heterogeneity in dependent instruments [HEIDI] test) to detect cases where the association arises due to linkage - the scenario that is of less biological interest.
Here, we ran SMR + HEIDI to identify causal or pleiotropic associations between genes and vitamin D. First, we applied these methods to the largest eQTL dataset available to us (eQTLGen). Second, we validated our results using other eQTL data sets generated in relevant tissues.
Below we present results from these analyses.
We ran SMR with the following input parameters:
bfile: A reference dataset for LD estimation. We used unrelated individuals of European ancestry from the UKB.
gwas-summary: UKB GWAS summary statistics obtained with fastGWA (in .ma format)
extract-snp: We restricted these analyses to SNPs that were not in the MHC region (Chr6:26056121 – Chr6:33377699) and with MAF>0.01
beqtl-summary: eQTL summary-level data, i.e. association results between genetic variants and gene expression levels or methylation levels. Files are in BESD format, i.e. a file set containing the following three files: an .epi
file (expression probe information), an .esi
file (expression SNP information), and a .besd
file (binary with expression summary data). We used the following eQTL datasets:
All other settings were left as default:
# #######################
# Prepare list of SNPs to include in the analyses
# #######################
geno=$medici/UKBiobank/v3EUR_impQC/ukbEUR_imp_chr{TASK_ID}_v3_impQC
filters=maf0.01_noMHC
genoDir=$WD/input/UKB_v3EUR_impQC
snpList=$genoDir/UKB_EUR_impQC_$filters
# Get list of SNPs to include in the analysis
# Make file with MHC region to be excluded from analysis
echo "6 26056121 33377699 mhc" > $WD/input/mhc_range
# Extract common (MAF>0.01) variants that are not in MHC region
cd $genoDir
tmp_command="plink --bfile $geno \
--maf 0.01 \
--exclude range $WD/input/mhc_range \
--write-snplist \
--out ${snpList}_chr{TASK_ID}"
qsubshcom "$tmp_command" 1 30G filter.$filters 10:00:00 "-array=1-22"
cat ${snpList}_chr*.snplist > $snpList.snplist
rm UKB_EUR_impQC_maf0.01_noMHC_chr*
# #######################
# Prepare QTL datasets
# #######################
# Common settings
qtlDir=$WD/input/qtls
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
# ---- Create links to eQTL datasets
cd $qtlDir
# eQTLgen
ln -s $medici/uqtqi2/eQTLGen/rug_32k_cis* $qtlDir
for i in `ls $qtlDir/rug_32k_cis*`; do mv $i `echo $i | sed 's/rug_32k_cis/eQTLgen/g'`; done
# GTEx v7
ln -s $medici/GTExV7/Summary/besd/* $qtlDir
# PsychENCODE
#wget http://cnsgenomics.com/data/SMR/PsychENCODE_cis_eqtl_HCP100_summary.tar.gz
#tar -xzf PsychENCODE_cis_eqtl_HCP100_summary.tar.gz
ln -s $qtlDir/PsychENCODE_cis_eqtl_HCP100_summary/Gandal_PsychENCODE_eQTL_HCP100+gPCs20_QTLtools* $qtlDir
for i in `ls $qtlDir/Gandal_PsychENCODE_eQTL_HCP100+gPCs20_QTLtools*`; do mv $i `echo $i | sed 's/Gandal_PsychENCODE_eQTL_HCP100+gPCs20_QTLtools/PsychENCODE/g'`; done
# Fetal brain (O’Brien et al. 2018)
ln -s $medici/uqtqi2/smr_data/fetal/Brien_eQTL* $qtlDir
# ---- List QTL datasets to use
qtl_datasetList=$tmpDir/qtl_datasets_4SMR
echo "Brien_eQTL eQTLgen Liver PsychENCODE Brain_Amygdala Brain_Anterior_cingulate_cortex_BA24 Brain_Caudate_basal_ganglia Brain_Cerebellar_Hemisphere Brain_Cerebellum Brain_Cortex Brain_Frontal_Cortex_BA9 Brain_Hypothalamus Brain_Hippocampus Brain_Nucleus_accumbens_basal_ganglia Brain_Putamen_basal_ganglia Brain_Spinal_cord_cervical_c-1 Brain_Substantia_nigra Skin_Not_Sun_Exposed_Suprapubic Skin_Sun_Exposed_Lower_leg Whole_Blood" | tr ' ' '\n' > $qtl_datasetList
# #######################
# Run SMR
# #######################
# RCC settings
smr=$bin/smr/smr_Linux
nThreads=22
# Delta settings
smr=$bin/smr/smr_v0.706
nThreads=16
# Common settings
prefix=vitD_fastGWA_withX
qtlDir=$WD/input/qtls
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
ld_ref=$medici/UKBiobank/v3EUR_impQC/ukbEUR_imp_chr{TASK_ID}_v3_impQC
gwas=$results/maFormat/$prefix.ma
snps2include=$WD/input/UKB_v3EUR_impQC/UKB_EUR_impQC_maf0.01_noMHC.snplist
qtl_datasetList=$tmpDir/qtl_datasets_4SMR
output=$tmpDir/${prefix}_chr{TASK_ID}
# Run SMR
cd $results
for i in `cat $qtl_datasetList`
do
echo $i
qtl=$qtlDir/$i
tmp_command="$smr --bfile $ld_ref \
--gwas-summary $gwas \
--beqtl-summary $qtl \
--extract-snp $snps2include \
--thread-num $nThreads \
--out $output.$i"
qsubshcom "$tmp_command" $nThreads 100G $prefix.$i.smr 24:00:00 "-array=1-22 $account"
done
# Merge SMR results
for i in `cat $qtl_datasetList`
do
echo $i
sed 1q $tmpDir/${prefix}_chr1.$i.smr > $results/smr/$prefix.$i.smr
cat $tmpDir/${prefix}_chr*.$i.smr | grep -v 'Gene' >> $results/smr/$prefix.$i.smr
done
# #########################################
# Generate data for SMR locus plots
# #########################################
# Directories
results=$WD/results/fastGWA
tmpDir=$results/tmpDir
smrDir=$WD/results/fastGWA/smr
genoDir=$medici/UKBiobank/v3EUR_impQC
qtlDir=$WD/input/qtls
# Files and settings
prefix=vitD_fastGWA_withX
qtl_datasetList=$tmpDir/qtl_datasets_4SMR
gwas=$WD/results/fastGWA/maFormat/vitD_fastGWA_withX.ma
smr=$bin/smr/smr_v0.706
windowSize=6000
# Get list of gene locations for plotting
cd $smrDir
#wget https://www.cog-genomics.org/static/bin/plink/glist-hg19
# Loop through SMR datasets to get data to plot significant loci
for datasetName in `cat $qtl_datasetList`
do
dataset=$prefix.$datasetName.smr
# Generate file with significant results (the ones we want to plot)
n=`sed 1d $smrDir/$dataset | wc -l`
awk '$19<((0.05/'$n'))' $smrDir/$dataset > $smrDir/$dataset.smr_sig
# Retreive information for each probe
nProbes=`wc -l < $smrDir/$dataset.smr_sig `
if test $nProbes != 1; then arrays="-array=1-$nProbes"; i="{TASK_ID}"; else i=1; arrays=; fi
getProbe="probe=\$(awk -v i=$i 'NR==i {print \$1}' $smrDir/$dataset.smr_sig)"
getChr="chr=\$(awk -v i=$i 'NR==i {print \$2}' $smrDir/$dataset.smr_sig)"
runSMR="$smr --bfile $genoDir/ukbEUR_imp_chr\${chr}_v3_impQC \
--gwas-summary $gwas \
--beqtl-summary $qtlDir/$datasetName \
--plot \
--probe \$probe \
--probe-wind 6000 \
--gene-list $smrDir/glist-hg19 \
--out $smrDir/smrPlot.$datasetName.${windowSize}kb.chr\$chr"
qsubshcom "$getProbe; $getChr; $runSMR" 1 30G getSMRplot.${windowSize}kb.$dataset 24:00:00 "$arrays"
done
discovery = read.table("results/fastGWA/smr/vitD_fastGWA_withX.eQTLgen.smr", h=T, stringsAsFactors=F)
m=nrow(discovery)
pthrsh=0.05/m
discovery=discovery[discovery$p_SMR<pthrsh,]
The discovery dataset for eQTLs was eQTLgen.
# Significant SMR associations in eQTLgen
tmp=discovery[,c("probeID","ProbeChr","Probe_bp","Gene","topSNP","A1","b_SMR","se_SMR","p_SMR","p_HEIDI","b_GWAS","se_GWAS","p_GWAS","b_eQTL","se_eQTL","p_eQTL")]
#tmp=tmp[tmp$p_HEIDI>0.05,]
names(tmp)[grep("ProbeChr",names(tmp))]="Chr"
tmp=tmp[order(tmp$Chr,tmp$Probe_bp),]
# Format table
#options(warn=-1)
#tmp=tmp %>% mutate_at(vars(b_GWAS, se_GWAS, b_eQTL, se_eQTL, b_SMR, se_SMR), funs(round(., 2)))
#options(warn=0)
tmp$p_GWAS=format(tmp$p_GWAS, digits=3)
tmp$p_eQTL=format(tmp$p_eQTL, digits=3)
tmp$p_SMR=format(tmp$p_SMR, digits=3)
tmp$p_HEIDI=format(tmp$p_HEIDI, digits=3)
# Present table
datatable(tmp,
rownames=FALSE,
extensions=c('Buttons','FixedColumns'),
options=list(pageLength=10,
dom='frtipB',
buttons=c('csv', 'excel'),
scrollX=TRUE),
caption="List of associations that are significant in SMR analyses - eQTLgen") %>% formatStyle("p_eQTL","white-space"="nowrap") %>% formatStyle("p_SMR","white-space"="nowrap") %>% formatStyle("p_HEIDI","white-space"="nowrap")
res=fread("$path/vitD_fastGWA_withX",header=T)
res=as.data.frame(res)
smr=read.table("$path/smr/plot_setting/vitD_fastGWA_withX.eQTLgen.smr",header=T)
smr$window=NA
for (i in 1:nrow(smr)){
tmp=res[res$CHR==smr[i,6] & abs(res$POS-smr[i,7])<1e6,]
sig=tmp[tmp$P<1e-5,]
win=(max(sig$POS)-min(sig$POS))/1000
smr[i,"window"]=win
print(i)
print(win)
}
smr[smr$window<200,"window"]=200
write.table(smr,"smr_plot_setting_eQTLgen.txt",row.names=F,col.names=T,quote=F)
write.table(smr$probeID,"PROBE_ID_eQTLgen.txt",row.names=F,col.names=F,quote=F)
# ***********************
# SMR plot
# ***********************
source("$path/smr/plot_epiSMR_yang.r")
# pick up one probe for plotting
probe="ENSG00000137709"
## Read the setting file
set=read.table("$path/eQTLgen/smr_plot_setting.txt",header=T)
name=set[set$probeID==probe,"filename"]
chr=set[set$probeID==probe,"ProbeChr"]
plotfile=paste0("$path/eQTLgen/smrPlot.eQTLgen.6000kb.chr",chr,".",probe,".txt")
plotdata=ReadSMRData(plotfile);
## Set window size
window=set[set$probeID==probe,"window"]
## Set top eQTL
top=as.character(set[set$probeID==probe,"topSNP"])
## Set p_SMR threshold
smr_thresh=3.22e-06; smr_thresh_plot=3.22e-06;
png(paste0(name,".png"), width=15, height=18, units='in', bg="transparent", res=150,type = c("cairo"))
SMRLocusPlot(data=plotdata, probeNEARBY=NULL,smr_thresh, smr_thresh_plot, heidi_thresh=0.05, heidi_thresh_plot=0, plotWindow=window, pointsize=20, max_anno_probe=5, highlight=top, epi_plot=T, anno_self=T, plotBP=NULL, annoSig_only=F)
dev.off()
The plot below is an example of a locus identified with the SMR and HEIDI-threshold filtering methods. This example is presentedin Supplementary Figure 4 of the manuscript (probe ENSG00000137709).
From top to bottom, the first track shows -log10(P-value) of SNPs (grey dots) from the fastGWA GWAS of 25OHD. Each red rhombus indicates the -log10(P-value) from the SMR tests for associations of gene expression with 25OHD concentration. A solid rhombus represents a probe not rejected by the HEIDI filtering. The yellow rhombus denotes the SNP-25OHD association of the SNP that is the top cis-eQTL (rs2465654). The second track shows -log10(P-value) of the SNP association with gene expression probe ENSG00000137709 (tagging POU2F3). The third track displays information for 25 chromatin states (indicated by the colours on the right bar) of 127 samples from Roadmap Epigenomics Mapping Consortium (REMC) for different tissues and cell types (indicated by the colours on the left bar),which communicates potential tissue-specific roles for the SNP. The bottom track shows the genes underlying the genomic region.
We validated the results in the remaining eQTL data sets.
# Create list of data sets used
dataset_names=data.frame(tissue=c("Blood","Sun-exposed skin","Non-sun-exposed skin","Liver","Brain - Amygdala","Brain - Anterior cingulate cortex BA24","Brain - Caudate basal ganglia","Brain - Cerebellar hemisphere","Brain - Cerebellum","Brain - Cortex","Brain - Frontal cortex BA9","Brain - Hippocampus","Brain - Hypothalamus","Brain - Nucleus accumbens basal ganglia","Brain - Putamen basal ganglia","Brain - Spinal cord cervical c-1","Brain - Substantia nigra","Brain - Prefrontal cortex","Fetal brain"),
type=rep("eQTL",19),
prefix=c("Whole_Blood","Skin_Sun_Exposed_Lower_leg","Skin_Not_Sun_Exposed_Suprapubic","Liver","Brain_Amygdala","Brain_Anterior_cingulate_cortex_BA24","Brain_Caudate_basal_ganglia","Brain_Cerebellar_Hemisphere","Brain_Cerebellum","Brain_Cortex","Brain_Frontal_Cortex_BA9","Brain_Hippocampus","Brain_Hypothalamus","Brain_Nucleus_accumbens_basal_ganglia","Brain_Putamen_basal_ganglia","Brain_Spinal_cord_cervical_c-1","Brain_Substantia_nigra","PsychENCODE","Brien_eQTL"),
stringsAsFactors=F)
dataset_names$fileName=paste0("vitD_fastGWA_withX.",dataset_names$prefix,".smr")
# Merge results from all validation datasets
df=c()
datasets_available=list.files("results/fastGWA/smr/", pattern="vitD_fastGWA_withX")
datasets_available=grep("eQTLgen|caQTL", invert=T, datasets_available ,value=T) # Omit results from eQTLgen or caQTLs
for(i in datasets_available)
{
#print(grep(i,datasets_available))
# Load results for this dataset
tmp=read.table(paste0("results/fastGWA/smr/",i), h=T, stringsAsFactors=F, quote = "")
tmp=tmp[,c("probeID","ProbeChr","Probe_bp","Gene","topSNP","A1","b_SMR","se_SMR","p_SMR","p_HEIDI","b_GWAS","se_GWAS","p_GWAS","b_eQTL","se_eQTL","p_eQTL")]
names(tmp)[grep("ProbeChr",names(tmp))]="Chr"
# Restrict to SMR significant results
m=nrow(tmp)
tmp=tmp[tmp$p_SMR<0.05/m,]
# Add to table with results from all datasets
if(nrow(tmp)>0)
{
tmp$fileName=i
tmp$N_probes=m
tmp$bonf_P=0.05/m
df=rbind(df, tmp)
}
}
df=merge(df,dataset_names, all.x=T)
df$fileName=NULL
# Table with all SMR significant results
tmp=df[df$type=="eQTL",grep("prefix|type", names(df), invert=T)]
tmp=tmp[,c("tissue",grep("tissue", names(tmp), invert=T, value=T))]
datatable(tmp,
rownames=FALSE,
extensions=c('Buttons','FixedColumns'),
options=list(pageLength=10,
dom='frtipB',
buttons=c('csv', 'excel'),
scrollX=TRUE),
caption="List of associations that are significant in SMR analyses in other eQTL datasets") %>%
formatStyle("tissue","white-space"="nowrap") %>%
formatStyle("Gene","white-space"="nowrap") %>%
formatStyle("p_eQTL","white-space"="nowrap") %>%
formatStyle("p_SMR","white-space"="nowrap") %>%
formatStyle("p_HEIDI","white-space"="nowrap")