Phenotype data were downloaded from the UK Biobank Showcase following the instructions that were on the email sent to us on May 1st, 2019. Then, the files (encrypted dataset and key) were transferred to delta (IMB cluster) and data were decrypted following these instructions.

ukb_bridgingfile_12505_10214.txtwas used to match IDs from projects 12505 and 10214.



Variables extracted

To generate phenotype files to use in the GWA analyses, we extracted the following data fields:

*These variables were not available through this project. We extracted them from project 12505.

cd $WD
dataset=$WD/data/10214_29578_biochemistry.tab
outFile=$WD/input/vitaminD_rawData_10214

# Check which fields are available from project 10214
#sed 1q $dataset | sed 's/\t/\n/g' | awk '{print NR,$0}' | grep -E 'f.31\.|f.50\.|f.53\.|f.20116\.|f.21001\.|f.21003\.|f.22703\.|f.30890\.|f.20084\.|f.22000\.|f.104670' 

# Extract phenotypes
cut -f1-2,5-6,8-9,1471-1575,1736-1737,1758-1759,1762,1929-1930,2596-2600 $dataset | awk 'NR==1 {OFS="\t"; $1="IID"}1' > $outFile


# Copy extracted phenotypes to local computer (run commands locally)
cd $local/input
scp $deltaID:$WD/input/vitaminD_rawData_10214 .
scp $deltaID:$projects/ukb/results/vitD_lfData ./vitaminD_rawData_12505

Data description


# *****************************************
# Load data ----
# *****************************************
data=read.table("input/vitaminD_rawData_10214", h=T, stringsAsFactors=F)
df=melt(data, measure.vars=c("f.30890.0.0","f.30890.1.0"), variable.name="visit", value.name="vitaminD")
levels(df$visit)=c("v0","v1")
df=df[!is.na(df$vitaminD),]

# Read data from project 12505 and merge with the rest of the data
tmpPheno=read.table("input/vitaminD_rawData_12505", h=T, stringsAsFactors=F)
  ## Add coffee intake
  ci=read.table("input/coffee_lfData", h=T, stringsAsFactors=F)
  tmpPheno=rbind(tmpPheno,ci)
tmpPheno=dcast(tmpPheno, IID ~ fieldNumber, value.var="code")
bridgingIDs=read.table("data/ukb_bridgingfile_12505_10214.txt",h=T, stringsAsFactor=F, col.names=c("IID_12505","IID_10214"))
names(df)[grep("IID",names(df))]="IID_10214"
df=merge(df,bridgingIDs)
names(df)[grep("IID_12505",names(df))]="IID"
df$IID_10214=NULL
df=merge(df,tmpPheno, all.x=T)

# Principal Components (to save as R object - quicker to read in)
pcs=read.table("input/ukbEUR_PC40_HM3excludeRegions_v2.txt", h=T, stringsAsFactors=F)


# *****************************************
# Format variables and define measurements at date of blood test ----
# *****************************************
# Sex
names(df)[grep("f.31.0.0",names(df))]="sex"
df$sex=factor(df$sex, levels=c(0,1), labels=c("Female","Male"))

# Height
df$height=ifelse(df$visit=="v0", df$f.50.0.0, df$f.50.1.0)

# Assessment date
df$assessDate=ifelse(df$visit=="v0", df$f.53.0.0, df$f.53.1.0)

# Assessment centre
df$assessCentre=ifelse(df$visit=="v0", df$f.54.0.0, df$f.54.1.0)

# BMI
df$bmi=ifelse(df$visit=="v0", df$f.21001.0.0, df$f.21001.1.0)

# Age
df$age=ifelse(df$visit=="v0", df$f.21003.0.0, df$f.21003.1.0)

# Smoking status
df$smoke=ifelse(df$visit=="v0", df$f.20116.0.0, df$f.20116.1.0)

# Genotyping batch
names(df)[grep("f.22000.0.0",names(df))]="batch"

# Coffee intake
df$coffeeIntake=ifelse(df$visit=="v0", df$f.1498.0.0, df$f.1498.1.0)

# Vitamin D supplements
  # Isolate and reformat data from individuals who took supplements
  supplements=df[,grep("IID|f.20084",names(df))]
  supplements=melt(supplements, id.vars="IID")
  supplements=supplements[!is.na(supplements$value),]
  supplements$variable=as.character(supplements$variable)
  supplements$variable=substr(supplements$variable, 1, 9)
  # Define answers that confirm vitamin D supplements and answers that confirm other supplements (to distinguish from missing information)
  supplements$value=ifelse(supplements$value==479, "vitD", "other")
  supplements=supplements[!duplicated(supplements),]
  # If someone took vitamin D + other supplements, we group them with the vitamin D group (only interested in vitamin D - yes, no or NA)
  for(i in unique(supplements$variable))
  {
    tmp=supplements[supplements$variable==i,]
    repIDs=tmp[duplicated(tmp$IID),"IID"]
    supplements[supplements$variable==i & supplements$IID %in% repIDs & supplements$value=="other","value"]="vitD"
    # Check all is good
    if(table(tmp[tmp$IID %in% repIDs,"value"])[1]!=table(tmp[tmp$IID %in% repIDs,"value"])[2]){print("Check data - possible error")}
  }
  supplements=supplements[!duplicated(supplements),]
  supplements=dcast(supplements,IID~variable, value.var="value")
  df=df[,grep("f.20084.",names(df),invert=T)]
  df=merge(df,supplements, all.x=T)
  # Add information from individuals who were asked and said they had taken no supplements
  df$supplementUse=factor(ifelse(df$visit=="v0", df$f.104670.0.0, df$f.104670.4.0), levels=c(0,1), labels=c("no","yes"))
  df$supplementUseEver=unlist(apply(df[,grep("f.104670",names(df))], 1, function(x){ifelse(all(is.na(x)), 
                                                                                           NA, 
                                                                                           ifelse(sum(x,na.rm=T)>0, "yes", "no"))}))
  # Define indidivuals who had/didn't have vitamin D supplements the day before or ever
  df$supplements=ifelse(is.na(df$supplementUse), NA, ifelse(df$visit=="v0", df$f.20084.0, df$f.20084.4))
  df[is.na(df$supplements) & !is.na(df$supplementUse),"supplements"]="none"
  df$supplementsEver=unlist(apply(df[,grep("f.20084.",names(df))], 1, function(x){ifelse(all(is.na(x)), NA, 
                                                                                         ifelse(any(x=="vitD",na.rm = T), "vitD","other"))} ))
  df[!is.na(df$supplementUseEver) & df$supplementUseEver=="no","supplementsEver"]="none"
  
# Remove redundant variables
df=df[,grep("f.50.|f.53.|f.54.|f.1498|f.20116.|f.21001.|f.21003.|f.20084.|f.104670|supplementUse",names(df),invert=T)]

# Define season (winter: Dec-Apr | summer: Jun-Oct)
df$assessMonth=as.numeric(substr(df$assessDate,6,7))
df[df$assessMonth<=4 | df$assessMonth>=12,"season"]="winter"
df[df$assessMonth>=6  & df$assessMonth<=10,"season"]="summer"


# *****************************************
# Identify first blood test ----
# *****************************************
# Define which visit vitamin D levels were first assessed
tmp=dcast(df, IID~visit, value.var="vitaminD")
tmp$analyse=ifelse(!is.na(tmp$v0),"v0","v1")

# Identify data to analyse (i.e. when vitamin D levels were fisrt assessed)
df=merge(df,tmp[,c("IID","analyse")])
df$analyse=ifelse(df$analyse==df$visit, "yes","no")

# *****************************************
# Save tidy dataset ----
# *****************************************
if(saveResults){write.table(df, "input/vitaminD_tidyData", quote=F, row.names=F)}
if(saveResults){save(pcs, file="input/ukbEUR_PC40_HM3excludeRegions_v2.Rdata")}
df=read.table("input/vitaminD_tidyData", h=T, stringsAsFactors=F)

# Principal Componentss
load("input/ukbEUR_PC40_HM3excludeRegions_v2.RData")

Overview

df2=reshape2::dcast(df,IID~visit, value.var="vitaminD")

# Sample size with and without vitamin D levels assessed
tot=length(count.fields("input/vitaminD_rawData_10214",skip=1))
vitD=nrow(df2)
f=round(vitD/tot*100,2)

# Sample sizes by instance
n1=nrow(df[df$visit=="v0",])
n2=nrow(df[df$visit=="v1",])
n02=nrow(df2[is.na(df2$v0) & !is.na(df2$v1),])
n12=length(unique(df[duplicated(df$IID),"IID"]))

# Restricted to EUR
v3EUR_HM3_IDs=read.table("input/UKB_v3EUR_HM3/ukbEUR_imp_chr1_v3_HM3.fam", h=F, stringsAsFactors=F)
nEUR=nrow(v3EUR_HM3_IDs)
n1_EUR=nrow(df[df$visit=="v0" & df$IID %in% v3EUR_HM3_IDs$V1,])
n02_EUR=nrow(df2[is.na(df2$v0) & !is.na(df2$v1) & df2$IID %in% v3EUR_HM3_IDs$V1,])

# Make table of sample sizes
tmp=data.frame(assessmentTime=c("First assessment at v0","First assessment at v1","Total"),
               All=c(n1, n02, n1+n02),
               Europeans=c(n1_EUR, n02_EUR, n1_EUR+n02_EUR))
names(tmp)[1]=""
tmp=kable(tmp, row.names=F)

We have data for 502536 individuals. Of those, 449978 (89.54%) have vitamin D levels assessed:

  • 448376 individuals assessed on v0 (2006-2010)
  • 17039 individuals assessed on the v1 (2012-13)

15437 have vitamin D levels measured in both visits.

We restricted our analyses to the 1st vitamin D measurement that each individual has, and analysed individuals of European ancestry only (N=417580):

All Europeans
First assessment at v0 448376 416072
First assessment at v1 1602 1508
Total 449978 417580

Distribution

# Restrict analyses to 1st blood collection only (i.e. remove duplicate observations)
df=df[df$analyse=="yes",]
df$analyse=NULL

# Restrict to individuals of European ancestry
df=df[df$IID %in% v3EUR_HM3_IDs$V1,]

# Summary of distribution
se=function(x) {sqrt(var(x,na.rm=TRUE)/length(na.omit(x)))}

vitD=df$vitaminD
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T)
tmp$N=nrow(df)
tmp2=kable(tmp, caption=paste0("Vitamin D (nmol/L)"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D (nmol/L)
Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd N
10 33.5 47.9 49.56348 63.2 340 0.0324702 20.9824 417580
# Density plot of vitamin D
ggplot(df, aes(x=vitaminD)) +
  geom_density(fill="lightcoral", color=alpha("grey", 0),alpha = 0.6) +
  labs(x="25 hydroxyvitamin D (nmol/L)")



We applied a rank-based inverse-normal transformation to vitamin D levels.

# *************************
# Normalise vitamin D -----
# *************************
norm = function(x){x[order(x,na.last=NA)]=qnorm(ppoints(length(x[!is.na(x)])),lower=T);x }
df$norm_vitD=norm(df$vitaminD)
df$log_vitD=log(df$vitaminD)
  # Save for all descriptive analayses below
  df$season=NULL
  if(saveResults){write.table(df, "input/vitaminD_tidyData_noMissing", quote=F, row.names=F)}

# Summary of normalised distribution
df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)
vitD=df$norm_vitD
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T)
tmp$N=nrow(df)
tmp2=kable(tmp, caption=paste0("Vitamin D (RINT)"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D (RINT)
Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd N
-4.716892 -0.6744879 0 0 0.6744879 4.716892 0.0015475 0.9999996 417580
# Density plot of normalised vitamin D
ggplot(df, aes(x=norm_vitD)) +
  geom_density(fill="lightcoral", color=alpha("grey", 0),alpha = 0.6) +
  labs(x="25 hydroxyvitamin D  (RINT)")

# *************************
# Outliers (> 5 sd of mean) ----
# *************************
tmp=df$norm_vitD
cutoff=mean(tmp)+5*sd(tmp)
n_outliers=length(tmp[tmp>cutoff])

There are 0 after normalising vitamin D levels.



Distribution by assessment visit

Some individuals (N=1508) were assessed for the first time when the first repeat visits (v1) were taking place for individuals assessed at v0. Here we have the comparison of vitamin D levels for individuals assessed for the first time at v0 and for individuals assessed for the first time at v1.

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# Summary of distribution at first assessment visit
vitD=df[df$visit=="v0","norm_vitD"]
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T)
tmp$N=length(vitD)
tmp2=kable(tmp, caption=paste0("Vitamin D (normalised) at first assessment visit (N=",n1,")"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D (normalised) at first assessment visit (N=448376)
Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd N
-4.716892 -0.6738851 0.0003842 0.0003827 0.6746838 4.716892 0.0015498 0.9996845 416072
# Summary of distribution at first repeat assessment visit
vitD=df[df$visit=="v1","norm_vitD"]
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T)
tmp$N=length(vitD)
tmp2=kable(tmp, caption=paste0("Vitamin D (normalised) at first repeat assessment visit (N=",n02,")"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D (normalised) at first repeat assessment visit (N=1602)
Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd N
-3.675827 -0.8365532 -0.1061536 -0.1055812 0.6212297 4.304144 0.0277759 1.078623 1508
# Density plot of normalised vitamin D by visit
ggplot(df, aes(x=norm_vitD)) +
  geom_density(aes(fill=visit), color="#e9ecef",alpha = 0.6) +
  scale_fill_manual(values=c("cadetblue","orange")) +
  scale_color_manual(values=c("cadetblue","orange")) +
  xlab("25 hydroxyvitamin D (normalised)")



Distribution by sex

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# Summary in females
vitD=df[df$sex=="Female","norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T)
tmp2=kable(tmp, caption=paste0("Vitamin D in females (N=",n,")"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D in females (N=223454)
Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd
-4.716892 -0.676407 0.0002491 -0.0014229 0.6785364 4.304144 0.0021152 0.9998793
# Summary in males
vitD=df[df$sex=="Male","norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T)
tmp2=kable(tmp, caption=paste0("Vitamin D in males (N=",n,")"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D in males (N=194126)
Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd
-4.378065 -0.6722739 -0.0003362 0.0016378 0.6700577 4.716892 0.00227 1.000138
# Density plot by sex
ggplot(df, aes(x=norm_vitD)) +
  geom_density(aes(fill=sex), color="#e9ecef",alpha = 0.6) +
  scale_fill_manual(values=c("cadetblue","orange")) +
  scale_color_manual(values=c("cadetblue","orange")) +
  theme(legend.title = element_blank()) +
  xlab("25 hydroxyvitamin D  (normalised)")



Time series

Vitamin D in Year-Months

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# *************************
# Average ----
# *************************
# Calculate average and 95% CI in each month
df$assessMonthYear=substr(df$assessDate,1,7)
df_YMmeans = do.call(data.frame,aggregate(norm_vitD ~ assessMonthYear+visit, df, function(x)c(mean=mean(x), sd=sd(x), N=length(x))))
names(df_YMmeans)[3:5] = c("norm_vitD","sd","N")
df_YMmeans$minCI = df_YMmeans$norm_vitD - 1.96*df_YMmeans$sd/sqrt(df_YMmeans$N)
df_YMmeans$maxCI = df_YMmeans$norm_vitD + 1.96*df_YMmeans$sd/sqrt(df_YMmeans$N)

# Sample size per month
tmp=data.frame(table(df$assessMonthYear, df$visit))
tmp=tmp[tmp$Freq!=0,]
names(tmp)=c("ym","visit","freq")

# Remove outliers
outliers=head(tmp[order(tmp$freq),])
df_YMmeans=df_YMmeans[!df_YMmeans$assessMonthYear %in% outliers[outliers$freq<30,"ym"],]
  
# Plot monthly average
ggplot(df_YMmeans, aes(x=assessMonthYear, y=norm_vitD, group=visit,color=visit)) +
  geom_line(size=1.1) +
  scale_color_manual(values=c("cadetblue","orange")) +
  scale_fill_manual(values=c("cadetblue","orange")) +
  geom_ribbon(data=df_YMmeans,aes(ymin=minCI,ymax=maxCI, fill=visit),alpha=0.3, color=NA)+
  geom_point(size=4, aes(shape=visit, fill=visit)) +
  scale_shape_manual(values = c(21, 24)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        legend.title = element_blank(),
        legend.position = "bottom",
        plot.background = element_rect(fill="transparent",colour = NA)) +
  labs(title="Average by Year-Month",
       subtitle = "Shade represents the 95% CI of the mean", 
       y="Average 25 hydroxyvitamin D (RINT)", x=paste0("Month and Year of assessment"),
       caption="Note: Outlier Year-Month groups (N<30) were not included in the plot") +
  theme(plot.caption = element_text(hjust = 0))

# *************************
# Variance -----
# *************************
# Calculate variance and 95% CI
df_YMvars = do.call(data.frame,aggregate(norm_vitD ~ assessMonthYear+visit, df, function(x)c(var=var(x), N=length(x))))
names(df_YMvars)[3:4] = c("norm_vitD_s2","N")
chi2_0.025=qchisq(0.025,df_YMvars$N)
chi2_0.975=qchisq(0.975,df_YMvars$N)
df_YMvars$minCI = (df_YMvars$N-1)*df_YMvars$norm_vitD_s2/chi2_0.975
df_YMvars$maxCI = (df_YMvars$N-1)*df_YMvars$norm_vitD_s2/chi2_0.025

# Remove outliers
df_YMvars=df_YMvars[!df_YMvars$assessMonthYear %in% outliers[outliers$freq<30,"ym"],]

# Plot monthly variance
ggplot(df_YMvars, aes(x=assessMonthYear, y=norm_vitD_s2, group=visit,color=visit)) +
  geom_line(size=1.1) +
  scale_color_manual(values=c("cadetblue","orange")) +
  scale_fill_manual(values=c("cadetblue","orange")) +
  geom_ribbon(data=df_YMvars,aes(ymin=minCI,ymax=maxCI, fill=visit),alpha=0.3, color=NA)+
  geom_point(size=4, aes(shape=visit, fill=visit)) +
  scale_shape_manual(values = c(21, 24)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        legend.title = element_blank(),
        legend.position = "bottom",
        plot.background = element_rect(fill="transparent",colour = NA)) +
  labs(title="Variance by Year-Month",
       subtitle="Shade represents 95% CI of the variance",
       y="Variance of RINT 25 hydroxyvitamin D", x=paste0("Month and Year of assessment"),
       caption="Note: Outlier Year-Month groups (N<30) were not included in the plot") +
  theme(plot.caption = element_text(hjust = 0))

On average:

  • 9456 individuals were assessed per month in the first visit (v0)
  • 137 individuals were assessed per month in the repeat visit (v1)

Outliers in the time-series

  • 2006-12 (visit v0) - N=2
  • 2010-10 (visit v0) - N=10
  • 2013-06 (visit v1) - N=29
  • 2012-12 (visit v1) - N=93
  • 2012-08 (visit v1) - N=106
  • 2013-03 (visit v1) - N=115



Vitamin D in each month (all years aggregated)

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# *************************
# Average ----
# *************************
# Calculate average and 95% CI in each month
df_means = do.call(data.frame,aggregate(norm_vitD ~ assessMonth, df, function(x)c(mean=mean(x), sd=sd(x), N=length(x))))
names(df_means)[2:4] = c("norm_vitD","sd","N")
df_means$minCI = df_means$norm_vitD - 1.96*df_means$sd/sqrt(df_means$N)
df_means$maxCI = df_means$norm_vitD + 1.96*df_means$sd/sqrt(df_means$N)

# Plot monthly average
ggplot(data=df_means, aes(x=assessMonth, y=norm_vitD, group=1)) +
        geom_point(size=4, color="lightcoral") +
        geom_line(size=1.1, color="lightcoral") +
        geom_ribbon(data=df_means,aes(ymin=minCI,ymax=maxCI),alpha=0.3, color=NA, fill="lightcoral") +
        theme(legend.title = element_blank(),
              plot.background = element_rect(fill="transparent",colour = NA)) +
  scale_x_continuous(breaks=seq(1,12,1)) +
  labs(title="Overall monthly average",
       subtitle="Shade represents 95% CI of the mean",
       y="Average 25 hydroxyvitamin D (RINT)", x=paste0("Month of assessment")) 

# *************************
# Variance ----
# *************************
# Calculate variance and 95% CI in each month
df_vars = do.call(data.frame,aggregate(norm_vitD ~ assessMonth, df, function(x)c(var=var(x), N=length(x))))
names(df_vars)[2:3] = c("norm_vitD_s2","N")
chi2_0.025=qchisq(0.025,df_vars$N)
chi2_0.975=qchisq(0.975,df_vars$N)
df_vars$minCI = (df_vars$N-1)*df_vars$norm_vitD_s2/chi2_0.975
df_vars$maxCI = (df_vars$N-1)*df_vars$norm_vitD_s2/chi2_0.025


# Plot monthly average
ggplot(data=df_vars, aes(x=assessMonth, y=norm_vitD_s2, group=1)) +
  geom_point(size=4, color="lightcoral") +
  geom_line(size=1.1, color="lightcoral") +
  geom_ribbon(data=df_vars,aes(ymin=minCI,ymax=maxCI),alpha=0.3, color=NA, fill="lightcoral")+
        scale_shape_manual(values = c(21, 24)) +
        theme(legend.title = element_blank(),
              plot.background = element_rect(fill="transparent",colour = NA)) +
  scale_x_continuous(breaks=seq(1,12,1)) +
  labs(title="Overall monthly variance",
       subtitle="Shade represents 95% CI of the variance",
       y="Variance of RINT 25 hydroxyvitamin D", x=paste0("Month of assessment")) 

VitD supplements

Field 104670 indicates whether individuals replied to the survey on supplement intake.

For individuals who replied yes, field 20084 summaries which supplements they took. Individuals with code 479 in that question indicated that they had vitamin D supplements the day before the questionnaire.

Fields 104670 and 20084 were assessed at 5 instances:

  • instance 0: April 2009 to September 2010
    • instance 1: February 2011 to April 2011
    • instance 2: June 2011 to September 2011
    • instance 3: October 2011 to December 2011
    • instance 4: April 2012 to June 2012

To match these instances to the date at which vitamin D levels were measured, we need to map the questionnaire dates to the date of the vitamin D blood test. Vitamin D levels were measured at two instances:

  • instance 0: Initial assessment visit (2006-2010)
    • instance 1: First repeat assessment visit (2012-13)

We mapped instance 0 of the supplements to instance 0 of the blood test, and instance 4 of the supplements to instance 1 of the blood test.

Overview

tmp=data.frame(rbind(table(df$supplements,exclude = NULL),
                     table(df$supplementsEver, exclude = NULL)))
row.names(tmp)=c("24h before","Ever")
tmp=kable(tmp, caption="Number of individuals with/without supplements")
kable_styling(tmp, full_width=F)
Number of individuals with/without supplements
none other vitD NA.
24h before 34721 22200 1887 358772
Ever 90992 76863 11635 238090
#tmp=addmargins(table(df$supplements,df$supplementsEver,exclude = NULL))



Vitamin D in individuals with/without supplements 24h before

After mapping the supplement variables to the date at which vitamin D levels were assessed (see above), the following groups were defined:

  • NA: Individuals with no information on supplements taken 24h before
  • none: Individuals who reported not taking supplements 24h before
  • other: Individuals who reported taking other supplements (not vitamin D) 24h before
  • vitD: Individuals who reported taking vitamin D 24h before

In the summary table below, no-vitD summaries groups none and other together.

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# Summary in individuals with no supplement information
vitD=df[is.na(df$supplements),"norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n
tmp$supplementGroup="NA"
df2=tmp[,c(ncol(tmp), ncol(tmp)-1,1:(ncol(tmp)-2))]

# Summary in individuals with supplement information
for(i in c("none","other","vitD"))
{
  vitD=df[!is.na(df$supplements) & df$supplements==i,"norm_vitD"]
  n=length(vitD)
  tmp=data.frame(t(unclass(summary(vitD))))
  tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n 
  tmp$supplementGroup=i
  df2=rbind(df2, tmp[,names(df2)])
}

# Summary in individuals with no vitD supplements (i.e. 'none'+'other')
vitD=df[!is.na(df$supplements) & df$supplements %in% c("none","other"),"norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n
tmp$supplementGroup="no-vitD"
df2=rbind(df2, tmp[,names(df2)])

# Summary table for all groups above
tmp2=kable(df2, caption=paste0("Vitamin D (normalised) by supplement group - 24h BEFORE"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D (normalised) by supplement group - 24h BEFORE
supplementGroup N Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd
NA 358772 -4.716892 -0.7074967 -0.0310572 -0.0288286 0.6479781 4.488155 0.0016761 1.0039725
none 34721 -4.202981 -0.6397710 0.0020139 0.0152729 0.6609527 4.716892 0.0051883 0.9667668
other 22200 -3.752877 -0.1913082 0.3878432 0.3876460 0.9818337 3.814424 0.0059959 0.8933733
vitD 1887 -2.879547 0.0775011 0.6243050 0.6395681 1.2095667 4.103408 0.0196160 0.8521109
no-vitD 56921 -4.202981 -0.4766515 0.1641562 0.1605037 0.8028807 4.716892 0.0040080 0.9562249
# Density plot by supplement group
ggplot(df, aes(x=norm_vitD)) +
  geom_density(aes(fill=supplements), color="#e9ecef",alpha = 0.6) +
  xlab("Vitamin D (normalised)")




Vitamin D in individuals with/without supplements at any time point

Supplement intake was assessed at 5 time-points. Using information from all time-points, the following groups were defined:

  • NA: Individuals with no information on supplements taken (ever)
  • none: Individuals who always reported not taking supplements
  • other: Individuals who always reported taking other supplements (not vitamin D)
  • vitD: Individuals who ever reported taking vitamin D (may also have taken other supplements)

As before, the group no-vitD was obtained by grouping individuals in the categories none and other.

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# Summary in individuals with no supplement information
vitD=df[is.na(df$supplementsEver),"norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n
tmp$supplementGroup="NA"
df2=tmp[,c(ncol(tmp), ncol(tmp)-1,1:(ncol(tmp)-2))]

# Summary in individuals with supplement information
for(i in c("none","other","vitD"))
{
  vitD=df[!is.na(df$supplementsEver) & df$supplementsEver==i,"norm_vitD"]
  n=length(vitD)
  tmp=data.frame(t(unclass(summary(vitD))))
  tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n 
  tmp$supplementGroup=i
  df2=rbind(df2, tmp[,names(df2)])
}

# Summary in individuals with no vitD supplements (i.e. 'none'+'other')
vitD=df[!is.na(df$supplementsEver) & df$supplementsEver %in% c("none","other"),"norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n
tmp$supplementGroup="no-vitD"
df2=rbind(df2, tmp[,names(df2)])

# Summary in individuals with ever vitamin D vs. never/NA
df$supplementGroup=ifelse(!is.na(df$supplements) & df$supplements=="vitD", "ever", "never/NA")
vitD=df[df$supplementGroup=="never/NA","norm_vitD"]
n=length(vitD)
tmp=data.frame(t(unclass(summary(vitD))))
tmp$se=se(vitD); tmp$sd=sd(vitD,na.rm=T); tmp$N=n
tmp$supplementGroup="never/NA (none+other+NA)"
df2=rbind(df2, tmp[,names(df2)])


# Summary table for all groups above
tmp2=kable(df2, caption=paste0("Vitamin D (normalised) by supplement group - EVER"), row.names=F)
kable_styling(tmp2, full_width=F)
Vitamin D (normalised) by supplement group - EVER
supplementGroup N Min. X1st.Qu. Median Mean X3rd.Qu. Max. se sd
NA 238090 -4.716892 -0.7236918 -0.0414398 -0.0407201 0.6418865 4.378065 0.0020718 1.0109343
none 90992 -4.304144 -0.7914820 -0.1441555 -0.1211890 0.5305524 4.716892 0.0032528 0.9812146
other 76863 -3.947332 -0.3846385 0.2374143 0.2283545 0.8575268 4.248166 0.0034084 0.9449495
vitD 11635 -3.733036 -0.3782312 0.3038705 0.2724769 0.9492897 4.132261 0.0091830 0.9905259
no-vitD 167855 -4.304144 -0.6208657 0.0357630 0.0388716 0.6970480 4.716892 0.0023929 0.9803668
never/NA (none+other+NA) 415693 -4.716892 -0.6772090 -0.0031484 -0.0029033 0.6712566 4.716892 0.0015505 0.9996895
# Density plot by supplement group
ggplot(df, aes(x=norm_vitD)) +
  geom_density(aes(fill=supplementsEver), color="#e9ecef",alpha = 0.6) +
  xlab("Vitamin D (normalised)")

In the plot below, the following groups were used:

  • ever: vitamin D reported at any of the 5 time points (may also have taken other supplements). Equivalent to vitD above.
  • never/NA: individuals who (1) never reported taking vitamin D (may have reported other supplements or not taking supplements ever) or (2) never replied to the supplement questionnaire. This is equivalent to other+none+NA.
# Density plot by supplement group: ever vs. never/NA
ggplot(df, aes(x=norm_vitD)) +
  geom_density(aes(fill=supplementGroup), color="#e9ecef",alpha = 0.6) +
  xlab("Vitamin D (normalised)")

df$supplementGroup=NULL




Correlation between normalised vitamin D supplements and vitamin D levels

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)

# Boxplot of vitamin D levels against supplemnet intake
df2=df[!is.na(df$supplements),]
ggplot(df2, aes(x=supplements, y=norm_vitD, fill=supplements)) +
  geom_boxplot( alpha=0.6, na.rm=T) +
  scale_fill_discrete(name="Supplements 24h before") +
  labs(x="", y="Vitamin D (normalised)") 

# Boxplot of vitamin D levels against ever having taken supplemnets
df2=df[!is.na(df$supplementsEver),]
ggplot(df2, aes(x=supplementsEver, y=norm_vitD, fill=supplementsEver)) +
  geom_boxplot( alpha=0.6) +
  scale_fill_discrete(name="Supplements at any point asked") +
  labs(x="", y="Vitamin D (normalised)") 

# Test difference in vitamin D levels between individuals taking supplements and individuals not taking them
model1=summary(lm(norm_vitD~supplements, df2))
model2=summary(lm(norm_vitD~supplementsEver, df2))
tmp1=data.frame(model1$coefficients)[-1,]
tmp2=data.frame(model2$coefficients)[-1,]
row.names(tmp1)=gsub("supplements","",row.names(tmp1))
row.names(tmp2)=gsub("supplements","",row.names(tmp1))
kable_styling(kable(tmp1, digits=c(2,2,2,1e300), caption="Comparison of norm-vitD against individuals with no supplements 24h before"), full_width=F)
Comparison of norm-vitD against individuals with no supplements 24h before
Estimate Std..Error t.value Pr…t..
other 0.37 0.01 46.29 0.00000e+00
vitD 0.62 0.02 28.21 6.09638e-174
kable_styling(kable(tmp2, digits=c(2,2,2,1e300),caption="Comparison of norm-vitD against individuals with no supplements in any of the 5 instances"), full_width=F)
Comparison of norm-vitD against individuals with no supplements in any of the 5 instances
Estimate Std..Error t.value Pr…t..
other 0.35 0.00 73.83 0
vitD 0.39 0.01 41.37 0
# Test difference in vitamin D levels between individuals who took vitamin D supplements and those who took other supplements
model1=summary(lm(norm_vitD~supplements, df2[!is.na(df2$supplements) & df2$supplements!="none",]))
model2=summary(lm(norm_vitD~supplementsEver, df2[!is.na(df2$supplementsEver) & df2$supplementsEver!="none",]))
tmp1=data.frame(model1$coefficients)[-1,]
tmp2=data.frame(model2$coefficients)[-1,]
row.names(tmp1)=gsub("supplements","",row.names(tmp1))
row.names(tmp2)=gsub("supplements","",row.names(tmp1))
kable_styling(kable(tmp1, digits=c(2,2,2,1e300), caption="Comparison of norm-vitD against individuals with no vit D supplements 24h before"), full_width=F)
Comparison of norm-vitD against individuals with no vit D supplements 24h before
Estimate Std..Error t.value Pr…t..
vitD 0.25 0.02 11.8 4.688554e-32
kable_styling(kable(tmp2, digits=c(2,2,2,1e300),caption="Comparison of norm-vitD against individuals with no vit D supplements in any of the 5 instances"), full_width=F)
Comparison of norm-vitD against individuals with no vit D supplements in any of the 5 instances
Estimate Std..Error t.value Pr…t..
vitD 0.04 0.01 4.66 3.11149e-06

Covariates

Prepare file with covariates to include in MLM GWAS models

We fit the following variables in the mixed linear model (MLM) analyses:

  • Age
  • Sex
  • Assessment month
  • Assessment centre
  • First 40 PCs: Principal components (PCs) were included in the model to account for population stratification. In our analyses, we used PCs (Q0286/UKBiobank/v2Samples/ukbEUR_PC40_HM3excludeRegions_v2.txt) that Kathryn Kemper generated with FlashPCA, using ~500K HapMap3 SNPs and excluding regions of long-range LD (LD<0.9).
  • Genotyping batch: This is relevant because the first batch of individuals were ascertained for a COPD phenotype.
  • Supplement intake

Justification for correcting for assessment month: Our original plan was to correct the analyses for assessment month-year. However, when we tried fitting that in the fastGWA model, the covariate matrix could not be inverted. We tried removing the month-year groups with small sample size (e.g. 12-2006 - N=2), but the problem remained until we excluded all month-year groups with N<400 (i.e. excluding 2186 from analysis). This cut-off may not be easy to justify when we publish the results, so we ran the analyses fitting assessment month and compared the results with those obtained without the year-month groups with N<400. Results were highly correlated so we ran all follow-up analyses on the results obtained with correction for assessment month. Note that this is also the correction that was done in the SUNLIGHT consortium study.

See below the results from association tests with different combinations of covariates.

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)
#load("input/ukbEUR_PC40_HM3excludeRegions_v2.RData")

# ---- Categorical confounders
df$FID=df$IID
catCov=df[,c("FID","IID","sex","batch","assessCentre","assessMonth","supplementsEver")]
catCov[is.na(catCov$supplementsEver),"supplementsEver"]="NoInfo"
  # Remove levels with low sample size (N<400)
  #tmp=table(catCov$assessMonthYear)
  #catCov=catCov[!catCov$assessMonthYear %in% names(tmp[tmp<400]),]

  # Create file with no sex for X chromosome anlayses (which are run individually in males and females)
  catCov_noSex=catCov[,grep("sex",names(catCov), invert=T)]

# ---- Quantitative confounders
quantCov=df[,c("FID","IID","age")]
quantCov=merge(quantCov, pcs[,2:42], all.x=T)
names(quantCov)[1:2]=c("FID","IID")

  # Create a file with BMI as well
  quantCov_bmi=df[,c("FID","IID","age","bmi")]
  quantCov_bmi=merge(quantCov_bmi, pcs[,2:42], all.x=T)
  names(quantCov_bmi)[1:2]=c("FID","IID")


# ---- Save files 
if(saveResults){write.table(catCov, "input/categoricalCovariates", quote=F, row.names=F)}
if(saveResults){write.table(catCov_noSex, "input/categoricalCovariates_noSex", quote=F, row.names=F)}
if(saveResults){write.table(quantCov, "input/quantitativeCovariates", quote=F, row.names=F)}
if(saveResults){write.table(quantCov_bmi, "input/quantitativeCovariates_bmi", quote=F, row.names=F)}
scp $local/input/*Covariates $deltaID:$WD/input



Association tests with covariates

In this section we tested the association between potential confounder variables and normalised vitamin D levels. All variants tested were obtained from the UK Biobank (see above the fields they were extracted from), except principal components (PCs).

df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)
df$assessMonthYear=substr(df$assessDate,1,7)
#load("input/ukbEUR_PC40_HM3excludeRegions_v2.RData")

# Create file with phenotype and covariates to analyse
df2=df[,c("IID","vitaminD","norm_vitD","log_vitD","sex","age","bmi","batch","assessCentre","assessMonth","assessMonthYear","supplementsEver")]
df2=merge(df2, pcs[,2:12], all.x=T)
  catVars=c("assessCentre","assessMonth","assessMonthYear","batch","supplementsEver")
  df2[,catVars] = lapply(df2[,catVars], factor)

# Create a table to keep results from all association tests
out=data.frame(covariate=names(df2)[-c(1:4)],
               test="lm",beta=NA, r2=NA, p=NA, stringsAsFactors=F)
out[out$covariate %in% catVars,"test"]="anova"
out2=c()
for(iv in c("vitaminD","norm_vitD","log_vitD"))
{
  tmp=out
  tmp$IV=iv
  out2=rbind(out2,tmp)
}
out=out2

# Run associations
for (i in 1:nrow(out))
{
  iv=out[i,"IV"]
  c=out[i,"covariate"]
  
  if(!c %in% catVars)
  {
    model=lm(df2[,iv]~df2[,c])
    r2=summary(model)$r.squared
    coef=summary(model)$coefficients
    beta=round(coef[2,1],3)
    p=coef[2,4]
  }
  
  if(c %in% catVars)
  {
    model=lm(df2[,iv]~df2[,c])
    r2=summary(model)$r.squared
    coef=summary(aov(df2[,iv]~df2[,c]))
    beta=NA
    p=coef[[1]][["Pr(>F)"]][1]
  }
  
  out[i,c("beta","r2","p")]=c(beta, 
                              format(r2, digits=2), 
                              format(p, digits=3))
}

# Run a few extra associations
if(F)
{
  df2=merge(df2, pcs[,c(2,13:42)], all.x=T)
  lmp <- function (modelobject) {
    if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
    f <- summary(modelobject)$fstatistic
    p <- pf(f[1],f[2],f[3],lower.tail=F)
    attributes(p) <- NULL
    return(p)}
  tmp=c()
  # All 40 PCs
  for(iv in c("vitaminD","norm_vitD","log_vitD"))
  {
    fixed=paste(paste0("PC",1:40), collapse="+")
    model = as.formula(paste("df[,",iv,"]~",fixed))
    test=lm(model, data=df2)      
    r2=summary(test)$r.squared
  p=lmp(test)
  tmp=data.frame(covariate="First 40 PCs", test="lm", beta=NA, r2=r2, p=p, IV=iv)
  }
  

}

# Show results
out$p=format(out$p, digits=3)
out$p=as.character(out$p)
out=dcast(setDT(out), covariate +test ~ IV, value.var=c("beta","r2","p"))
datatable(out, 
          rownames=FALSE, 
          extensions=c('Buttons','FixedColumns'),
          options=list(dom='frtipB',
                       buttons=c('csv', 'excel'),
                       scrollX=TRUE),
          caption="Covariate association results with vitamin D (raw, RINT, and log-transformed)") 
# Compare vitamin D across assessment centres
  # Order centres by average normvitD 
  tmp=aggregate(norm_vitD~assessCentre, df2, mean)
  tmp=tmp[order(tmp$norm_vitD),]
  df2$assessCentre=factor(df2$assessCentre, levels=tmp$assessCentre)
  
  # Plot normvitD by centre
  ggplot(df2, aes(x=assessCentre, y=norm_vitD)) +
    geom_boxplot()

# Compare vitamin D in different geotyping batches
  # Order batched by average normvitD
    tmp=aggregate(norm_vitD~batch, df2, mean)
    tmp=tmp[order(tmp$norm_vitD),]
    df2$batch=factor(df2$batch, levels=as.character(tmp$batch))
    
  # Plot normvitD by batch
    ggplot(df2, aes(x=batch, y=norm_vitD)) +
    geom_boxplot()

Prepare phenotype

In this section, we prepared the phenotypes for the following analyses:

fastGWA and BOLT-LMM

# Copy a fam file from Delta to local to get the IDs of individuals with European ancestry (run command locally)
cd $local/input
scp $deltaID:$medici/v3EUR_HM3/ukbEUR_imp_chr1_v3_HM3.fam .

The main GWA analyses were conducted with mixed linear models (MLM):

To generate the phenotype for those analyses, we:

  • Restricted our sample to the full set of individuals with European ancestry
  • Normalised phenotype with a rank-based inverse normal transformation

Covariates were included in the GWA model to correct for confounder effects. The list of confouders used is in the Covariate tab.

Note that we still fit PCs as covariates in these analyses (see full list of variables that included in MLM analyses in the ‘Covariates’ tab). As mentioned in Jiang 2019 bioRxiv: “…although in theory MLM-based methods accounts for population stratification by fitting all (or a subset of selected) SNPs as random effects, MLM-based association analyses in large samples suggested that fitting PCs as covariates improves robustness.

# Restrict to ALL individuals of European ancestry
v3EUR_HM3_IDs=read.table("input/UKB_v3EUR_HM3/ukbEUR_imp_chr1_v3_HM3.fam", h=F, stringsAsFactors=F)
names(v3EUR_HM3_IDs)[1:2]=c("FID","IID")
df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)
df2=df[df$IID %in% v3EUR_HM3_IDs$IID,]

# Save log-transformed and RINT phenotype (to assess validity of meta-analysis)
if(saveResults){write.table(df2[,c("IID","log_vitD","norm_vitD")], "tmpDir/mlm_norm_log_pheno", row.names=F, quote=F)}

# Save normalised residuals to feed as phenotype to the fastGWA and BOLT-LMM
df2=df2[,c("IID","norm_vitD")]
df2$FID=df2$IID
df2=df2[,c("FID","IID","norm_vitD")]
if(saveResults){write.table(df2, "input/mlm_pheno", row.names=F, quote=F)}
# Copy phenotype files to Delta (cluster where we will run the GWAS)
cd $local/input
scp ./mlm_pheno $deltaID:$WD/input

Meta-analysis

To meta-analyse UKB results with results obtained by the SUNLIGHT consortium, we analysing the data in the same way, i.e.:

  • Restricting data to individuals of European ancestry
  • Using a log-transformed phenotype
  • Including the following covariates in the model (fastGWA):
    • Assessment month
    • Age
    • Sex
    • BMI
    • 40 PCs
    • Genotyping batch

Season-stratified GWAS

# Restrict to ALL individuals of European ancestry
df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)
#df2=df[df$IID %in% v3EUR_HM3_IDs$IID,] # Already done above
df2=df
df2$FID=df2$IID

# Define seasons
## Winter = Dec - Apr
## Summer = Jun - Oct
df2$season=NULL
df2[df2$assessMonth<=4 | df2$assessMonth>=12,"season"]="winter"
df2[df2$assessMonth>=6  & df2$assessMonth<=10,"season"]="summer"
df2=df2[!is.na(df2$season),]

# stratify data and apply RINT
  # Summarise
  winter=df2[df2$season=="winter",c("FID","IID","vitaminD","norm_vitD","log_vitD")]
  summer=df2[df2$season=="summer",c("FID","IID","vitaminD","norm_vitD","log_vitD")]
  summary(summer$vitaminD); sd(summer$vitaminD); summary(winter$vitaminD); sd(winter$vitaminD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   44.10   56.90   58.04   70.20  340.00
## [1] 19.79975
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   26.20   37.70   41.16   53.00  335.00
## [1] 19.3461
  summary(summer$norm_vitD); sd(summer$norm_vitD); summary(winter$norm_vitD); sd(winter$norm_vitD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.2030 -0.1662  0.3949  0.4127  0.9840  4.7169
## [1] 0.872875
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.7169 -1.0934 -0.4647 -0.4178  0.2265  4.4882
## [1] 0.9858428
  summary(summer$log_vitD); sd(summer$log_vitD); summary(winter$log_vitD); sd(winter$log_vitD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.303   3.786   4.041   3.999   4.251   5.829
## [1] 0.3666251
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.303   3.266   3.630   3.606   3.970   5.814
## [1] 0.4809793
winter=df2[df2$season=="winter",c("FID","IID","norm_vitD")]
summer=df2[df2$season=="summer",c("FID","IID","norm_vitD")]
norm = function(x){x[order(x,na.last=NA)]=qnorm(ppoints(length(x[!is.na(x)])),lower=T);x }
winter$norm_vitD=norm(winter$norm_vitD)
summer$norm_vitD=norm(summer$norm_vitD)

# Save "season" definition
df2$season_binary=ifelse(df2$season=="winter", 1, 2)
df2=df2[,c("FID","IID","norm_vitD","season","season_binary")]
if(saveResults){write.table(df2, "input/covariates_season", row.names=F, quote=F)}

# Save datasets to feed as phenotype to fastGWA
if(saveResults){write.table(winter, "input/mlm_winter_pheno", row.names=F, quote=F)}
if(saveResults){write.table(summer, "input/mlm_summer_pheno", row.names=F, quote=F)} 

After a visual inspection of the average vitamin D levels per month (see Time series tab), we defined two seasons of blood draw and stratified the UKB sample into two cohorts based on those:

  • “Winter” - individuals assessed Dec-Apr (N=162591)
  • “Summer” - individuals assessed Jun-Oct (N=177082)

Individuals with vitamin D levels assessed in the months of May and November were not included in these analyses as the transition between seasons varies between years and could confound the results.

The two phenotype files (one for each season) were generated by (1) restricting to individuals of European ancestry, (2) restricting the individuals assessed in the defined season and (3) applying a rank-based inverse-normal transformation.

# Copy phenotype files to Delta (cluster where we will run the GWAS)
scp $local/input/mlm*er_pheno $deltaID:$WD/input/

# Copy covariate created to represent "Season" to Delta
scp $local/input/covariates_season $deltaID:$WD/input/

vQTL GWAS

The phenotype used in the variance quantitative trait loci (vQTL) GWAS was pre-regressed of confounder effects. Briefly, we:

  • Restricted to unrelated European individuals
    • Using IDs in the fam for chr1 in Q0286/UKBiobank/v3EURu_HM3/
  • Regressed-out the effect of confounders within the following groups:
    • Females-vitD: females who ever reported taking vitamin D supplements (vitD group)
    • Females-other: females who ever reported taking supplements, but never vitamin D (other group)
    • Females-none: females who always reported not taking supplements (none group)
    • Females-NA females with no information on supplement intake (NA group)
    • Males-vitD: males who ever reported taking vitamin D supplements (vitD group)
    • Males-other: males who ever reported taking supplements, but never vitamin D other group)
    • Males-none: males who always reported not taking supplements (none group)
    • Males-NA: males with no information on supplement intake NA group)
  • Removed outliers more than 5 SD from the mean
  • Standardised the residuals to get mean 0 and SD 1 (note that this is not a non-linear transformation and will help with interpretation of downstream analyses)
  • Merged residuals from the different groups

Variables regressed from the phenotype were:

  • Age
  • Assessment month
  • Assessment centre
  • First 40 PCs: Principal components (PCs) were included in the model to account for population stratification. In our analyses, we used PCs (Q0286/UKBiobank/v2Samples/ukbEUR_PC40_HM3excludeRegions_v2.txt) that Kathryn Kemper generated with FlashPCA, using ~500K HapMap3 SNPs and excluding regions of long-range LD (LD<0.9).
  • Genotyping batch
# Add PCs to our data frame to remove their effect
df=read.table("input/vitaminD_tidyData_noMissing", h=T, stringsAsFactors=F)
#load("input/ukbEUR_PC40_HM3excludeRegions_v2.RData")
df2=merge(df, pcs, all.x=T)

# Restrict to unrelated individuals of European ancestry
v3EURu_HM3_IDs=read.table("input/ukbEURu_imp_chr1_v3_HM3.fam", h=F, stringsAsFactors=F)
names(v3EURu_HM3_IDs)[1:2]=c("FID","IID")
df2=df2[df2$IID %in% v3EURu_HM3_IDs$IID,]

There were 348501 unrelated individuals of European ancestry. Of those, 318851 had vitamin D levels assessed and will be analysed in this preliminary analysis.

# Show sample size of the groups within which we will run covariate regression
tmp=table(df2$sex, df2$supplementsEver, exclude=NULL)
tmp=kable(tmp, caption="Sample size of groups within which we ran covariate regression")
kable_styling(tmp, full_width=F)
Sample size of groups within which we ran covariate regression
none other vitD NA
Female 34278 35309 6624 93901
Male 36969 24897 2655 84218
# Define the groups within which we will run covariate regression
df2[is.na(df2$supplementsEver),"supplementsEver"]="NA"
df2$group=paste(df2$sex, df2$supplementsEver, sep="_")

# Code categorical variables as factors
df2[,c("assessCentre","assessMonth","batch")] = lapply(df2[,c("assessCentre","assessMonth","batch")], factor)

# Regress effect of covariates
nPCs=40 # number of PCs to include
for(i in unique(df2$group))
{
  # Get residuals
  test=paste("vitaminD ~ age + assessCentre + assessMonth + batch +", paste(paste0("PC",1:nPCs),collapse = "+"))
  model=lm(as.formula(test), df2[df2$group==i,])
  res=resid(model)
  
  # Remove outliers (>5 SD from mean)
  low=mean(res)-5*sd(res)
  high=mean(res)+5*sd(res)
  res[res<low | res>high]=NA
  
  # Standardize residuals for mean 0 and SD 1 (note: this is not a non-lienar transformation and will help with the interpretation of the results)
  res_std=(res-mean(res, na.rm=T))/sd(res, na.rm=T)
  
  # Add standardized residuals to data frame
  df2[df2$group==i,"vitD_res"]=res_std
}
df2=df2[!is.na(df2$vitD_res),]



# Plot residuals by group
tmp=reshape2::melt(df2[,c("IID","vitaminD","vitD_res","group")], id.vars=c("IID", "group"))
tmp$variable=factor(tmp$variable, levels=c("vitaminD","vitD_res"), labels=c("Before regression","After regression"))
ggplot(tmp, aes(x=group, y=value)) +
  geom_boxplot() +
  facet_wrap(variable~., scales="free") +
  labs(title="Vitamin D levels before and after covariate regression",
       subtitle="within regression group",
       x="",y="Vitamin D")

# Plot adjusted phenotype
ggplot(df2, aes(x=vitD_res)) +
  geom_density(fill="lightcoral", color=alpha("grey", 0),alpha = 0.6) +
  labs(title="Residuals from covariate regression",
       subtitle="Adjusted phenotype to use in GWAS",
       x="",y="Vitamin D (normalised residuals)")

# Save  residuals to feed as phenotype for the vQTL GWAS
df2=df2[,c("IID","vitD_res")]
df2$FID=df2$IID
df2=df2[,c("FID","IID","vitD_res")]
if(saveResults){write.table(df2, "input/vqtl_pheno", row.names=F, quote=F)}
# Copy phenotype file to Delta (cluster where we will run the GWAS)
scp $local/input/vqtl_pheno $deltaID:$WD/input/