1 Setup

1.1 Create directory structure and clone repo

(Working directory on EBI cluster: /hps/research1/birney/users/ian/mikk_paper)

# move to working directory
cd /your/working/directory
# clone git repository
git clone https://github.com/Ian-Brettell/mikk_genome.git

1.2 Create conda evironment

conda env create \
  -n mikk_env \
  -f mikk_genome/code/config/conda_env.yml
  
conda activate mikk_env

1.3 Setup `R`

# Load required libraries
require(here)
source(here::here("code/scripts/ld_decay/source.R"))

1.4 Copy MIKK panel VCF into working directory

(See supplementary material for how VCF was generated.)

# create directory for VCFs
mkdir vcfs

# Copy into working directory
cp /nfs/research1/birney/projects/medaka/inbred_panel/medaka-alignments-release-94/vcf/medaka_inbred_panel_ensembl_new_reference_release_94.vcf* vcfs

1.5 Key-value file for cram ID to line ID

mikk_genome/data/20200206_cram_id_to_line_id.txt

1.6 Remove sibling lines and replicates

Full list of 80 extant MIKK panel lines: mikk_genome/data/20200210_panel_lines_full.txt

Note: Line 130-2 is missing from the MIKK panel VCF.

Identify sibling lines

cat mikk_genome/data/20200210_panel_lines_full.txt | cut -f1 -d"-" | sort | uniq -d

Only keep first sibling line ( suffix _1); manually remove all others and write list of non-sibling lines to here: mikk_genome/data/20200227_panel_lines_no-sibs.txt. 64 lines total.

Excluded sibling lines here: mikk_genome/data/20200227_panel_lines_excluded.txt. 16 lines total.

Replace all dashes with underscores to match mikk_genome/data/20200206_cram_id_to_line_id.txt key file

sed 's/-/_/g' mikk_genome/data/20200227_panel_lines_no-sibs.txt \
  > mikk_genome/data/20200227_panel_lines_no-sibs_us.txt

Extract the lines to keep from the key file.

awk  'FNR==NR {f1[$0]; next} $2 in f1' \
  mikk_genome/data/20200227_panel_lines_no-sibs_us.txt \
  mikk_genome/data/20200206_cram_id_to_line_id.txt \
    > mikk_genome/data/20200227_cram2line_no-sibs.txt

Has 66 lines instead of 63 (64 lines minus 130-2, which isn’t in the VCF), so there must be replicates Find out which ones:

cat mikk_genome/data/20200227_cram2line_no-sibs.txt | cut -f2 | cut -f1 -d"_" | sort | uniq -d

32 71 84

Manually removed duplicate lines (mikk_genome/data/20200227_duplicates_excluded.txt):

24271_7#5 32_2
24271_8#4 71_1
24259_1#1 84_2

Final no-sibling-lines CRAM-to-lineID key file: mikk_genome/data/20200227_cram2line_no-sibs.txt

2 Create MIKK panel VCF with no sibling lines

# create no-sibs file with CRAM ID only
cut -f1 mikk_genome/data/20200227_cram2line_no-sibs.txt \
  > mikk_genome/data/20200227_cram2line_no-sibs_cram-only.txt
  
# make new VCF having filtered out non-MIKK and sibling lines
bcftools view \
  --output-file vcfs/panel_no-sibs.vcf \
  --samples-file mikk_genome/data/20200227_cram2line_no-sibs_cram-only.txt \
  vcfs/medaka_inbred_panel_ensembl_new_reference_release_94.vcf
  
# recode with line IDs
bcftools reheader \
  --output vcfs/panel_no-sibs_line-ids.vcf \
  --samples mikk_genome/data/20200227_cram2line_no-sibs.txt \
  vcfs/panel_no-sibs.vcf
  
# compress
bcftools view \
  --output-type z \
  --output-file vcfs/panel_no-sibs_line-ids.vcf.gz \
  vcfs/panel_no-sibs_line-ids.vcf
  
# index
bcftools index \
  --tbi \
  vcfs/panel_no-sibs_line-ids.vcf.gz

# get stats
mkdir stats

bcftools stats \
  vcfs/panel_no-sibs_line-ids.vcf.gz \
  > stats/20200305_panel_no-sibs.txt

## get basic counts
grep "^SN" stats/20200305_panel_no-sibs.txt

2.1 Make a version with no missing variants

vcftools \
  --gzvcf vcfs/panel_no-sibs_line-ids.vcf.gz \
  --max-missing 1 \
  --recode \
  --stdout > vcfs/panel_no-sibs_line-ids_no-missing.vcf
  
# compress
bcftools view \
  --output-type z \
  --output-file vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz \
  vcfs/panel_no-sibs_line-ids_no-missing.vcf

# create index
bcftools index \
  --tbi vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz
  
# get stats 
bcftools stats \
  vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz \
  > stats/20200305_panel_no-sibs_no-missing.txt

# get basic counts
grep "^SN" stats/20200305_panel_no-sibs_no-missing.txt

3 Generate Haploview plots

3.1 Create `plink` dataset from no-sib-lines, no-missing VCF

mkdir plink/20200716_panel_no-sibs_line-ids_no-missing

# make BED  
plink \
  --vcf vcfs/panel_no-sibs_line-ids_no-missing.vcf.gz \
  --make-bed \
  --double-id \
  --snps-only \
  --biallelic-only \
  --chr-set 24 no-xy \
  --chr 1-24 \
  --out plink/20200716_panel_no-sibs_line-ids_no-missing/20200716
  
# recode for 012 transposed
plink \
  --bfile plink/20200716_panel_no-sibs_line-ids_no-missing/20200716 \
  --recode A-transpose \
  --out plink/20200716_panel_no-sibs_line-ids_no-missing/20200716_recode012
# creates plink/20200716_panel_no-sibs_line-ids_no-missing/20200716_recode012.traw

3.2 Create BED sets filtered for MAF > 0.03, 0.05 and 0.10

maf_thresholds=$( echo 0.03 0.05 0.10 )

# Make new BEDs 
for i in $maf_thresholds ; do
  # make directory
  new_path=plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_maf-$i ;
  # make directory
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi
  # make BED set
  plink \
    --bfile plink/20200716_panel_no-sibs_line-ids_no-missing/20200716 \
    --make-bed \
    --double-id \
    --chr-set 24 no-xy \
    --maf $i \
    --out $new_path/20200803
done

3.3 Recode for Haploview

# Create output directory
mkdir plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_hv_thinned

hv_thinned_path=plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_hv_thinned

# Recode
for i in $maf_thresholds ; do
  new_path=$hv_thinned_path/$i ;
  # make directory
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi 
  # recode 
  for j in $(seq 1 24); do
    plink \
      --bfile plink/20200716_panel_no-sibs_line-ids_no-missing/20200803_maf-$i/20200803 \
      --recode HV-1chr \
      --double-id \
      --chr-set 24 no-xy \
      --chr $j \
      --allele1234 \
      --thin-count 3000 \
      --out $hv_thinned_path/$i/20200803_chr-$j;
  done;
done

# Edit .ped files to remove asterisks
for i in $maf_thresholds ; do
  for j in $(find $hv_thinned_path/$i/20200803_chr-*.ped); do
    sed -i 's/\*/0/g' $j;
  done;
done  

# Edit .info files to make the SNP's bp position its ID
for i in $maf_thresholds; do
  for j in $(find $hv_thinned_path/$i/20200803_chr*.info); do
    outname=$(echo $j\_with-id);
    awk -v OFS="\t" {'print $2,$2'} $j > $outname;
  done;
done

3.4 Plot

NOTE: This code requires Haploview, which you will need to install on your system: https://www.broadinstitute.org/haploview/haploview

hv_path=/nfs/software/birney/Haploview.jar # edit to your Haploview path

mkdir plots/20200803_ld_thinned/

for i in $maf_thresholds; do
  # set output directory
  new_path=plots/20200803_ld_thinned/$i ;
  # make directory
  if [ ! -d "$new_path" ]; then
    mkdir $new_path;
  fi   
  for j in $(seq 1 24); do
    bsub -M 20000 -o log/20200803_hv_$i\_$j.out -e log/20200803_hv_$i\_$j.err \
    "java -Xms18G -Xmx18G -jar $hv_path \
      -memory 18000 \
      -pedfile $hv_thinned_path/$i/20200803_chr-$j.ped  \
      -info $hv_thinned_path/$i/20200803_chr-$j.info_with-id \
      -maxDistance 1000 \
      -ldcolorscheme DEFAULT \
      -ldvalues RSQ \
      -minMAF $i \
      -nogui \
      -svg \
      -out $new_path/$j";
  done;
done

These svg files can be converted to pdf using:

https://www.zamzar.com/ for files > 30 MB (chr 1) - note limit on number of files you can convert
https://onlineconvertfree.com/convert-format/svg-to-pdf/ for the rest

The full Haploview LD plots are available in the Supplementary Material.

By inspecting these LD plots at the MAF > 0.05 level, we discovered the following LD blocks worthy of further investigation:

5:28181970-28970558 (788 Kb)
6:29398579-32246747 (2.85 Mb)
12:25336174-25384053 (48 Kb)
14:12490842-12947083 (456 Kb)
17:15557892-19561518 (4 Mb)
21:6710074-7880374 (1.17 Mb)

See zoomed plots here:

5:28181970-28970558

6:29398579-32246747

12:25336174-25384053

14:12490842-12947083

17:15557892-19561518

21:6710074-7880374

4 Genotype heatmaps for high-LD regions

See which lines are causing the high-LD regions at the MAF > 0.05 threshold (i.e. from a sample of 63 diploid individuals, variants with an allele count (AC) of at least 7).

4.1 Read data into BED matrix into `R`

# Read in BED matrix
mikk_full <- gaston::read.bed.matrix(file.path(lts_dir, "plink/20200716_panel_no-sibs_line-ids_no-missing/20200716"),
                                     rds = NULL)

# Read in genotypes file
mikk_geno <- readr::read_tsv(file = file.path(lts_dir, "plink/20200716_panel_no-sibs_line-ids_no-missing/20200716_recode012.traw"),
                             progress = T,
                             col_names = T)

# rename IDs
colnames(mikk_geno)[7:length(colnames(mikk_geno))] <- mikk_full@ped$id

4.2 Extract target regions and build into list

# get coordinates
high_ld_chrs <- c(5, 6, 12, 14, 17, 21)
high_ld_start <- c(28385805, 29608514, 25340000, 12584614, 15559963, 6800261)
high_ld_end <- c(28798048, 32212235, 25372985, 12861147, 19553529, 7760258)

# build into list
counter <- 0
high_ld_lst <- lapply(high_ld_chrs, function(x){
  counter <<- counter + 1
  x <- list("chr" = x,
            "start" = high_ld_start[counter],
            "end" = high_ld_end[counter])
  # find indexes for SNPs with MAF > 0.05
  x[["target_inds"]] <- which(mikk_full@snps$chr == x[["chr"]] &
                         dplyr::between(mikk_full@snps$pos, x[["start"]], x[["end"]]) &
                         mikk_full@snps$maf > 0.05)
  x[["target_snps"]] <- mikk_geno[x[["target_inds"]], ]  
  # make matrix
  x[["geno_mat"]] <- as.matrix(x[["target_snps"]][, -(1:6)])
  return(x)
})
names(high_ld_lst) <- high_ld_chrs

# save to repo
saveRDS(high_ld_lst, file.path(lts_dir, "20200727_high_ld_list.rds"))

4.3 Plot

Genotypes were recoded to 0, 1, 2 for REF, HET, and HOM_ALT respectively.

Dark red = 2 Orange = 1 Yellow = 0

# Write function to create heatmap
get_heatmap = function(in_list){
  # Get order of samples
  sample_order = colnames(in_list[["target_snps"]])[-(1:6)]  
  # Sort by count
  sorted_order = names(sort(colSums(in_list[["geno_mat"]]), decreasing = T))
  # Get re-ordered indein_listes
  new_ind = match(sorted_order, sample_order)
  # Plot
  heatmap(in_list[["geno_mat"]][, new_ind], 
          Rowv = NA,
          Colv = NA,
          scale = "row",
          keep.dendro = F)  
}

4.3.1 Chr 5

knitr::include_graphics(here::here("docs/plots/ld_decay/hv_5_28181970-28970558.png"))
x = high_ld_lst[["5"]]
get_heatmap(x)

5:28181970-28970558

4.3.2 Chr 6

knitr::include_graphics(here::here("docs/plots/ld_decay/hv_6_29398579-32246747.png"))
x = high_ld_lst[["6"]]
get_heatmap(x)

6:29398579-32246747

4.3.3 Chr 12

knitr::include_graphics(here::here("docs/plots/ld_decay/hv_12_25336174-25384053.png"))
x = high_ld_lst[["12"]]
get_heatmap(x)

12:25336174-25384053

4.3.4 Chr 14

knitr::include_graphics(here::here("docs/plots/ld_decay/hv_14_12490842-12947083.png"))
x = high_ld_lst[["14"]]
get_heatmap(x)

14:12490842-12947083

4.3.5 Chr 17

knitr::include_graphics(here::here("docs/plots/ld_decay/hv_17_15557892-19561518.png"))
x = high_ld_lst[["17"]]
get_heatmap(x)

17:15557892-19561518

4.3.6 Chr 21

knitr::include_graphics(here::here("docs/plots/ld_decay/hv_21_6710074-7880374.png"))
x = high_ld_lst[["21"]]
get_heatmap(x)

21:6710074-7880374

5 LD decay

We want to compare the rate at which LD decays with inter-SNP distance between the MIKK panel and humans. This will give an indication of the resolution at which one can map genetic traits using the MIKK panel, provided that at least two lines have the same variant of interest.

5.1 Obtain 1000 Genomes dataset

5.1.1 Download from FTP

cd vcfs

wget -r -p -k --no-parent -cut-dirs=5 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/

5.1.2 Put list of files into list

find vcfs/ftp.1000genomes.ebi.ac.uk/ALL.chr*.vcf.gz > mikk_genome/data/20200205_vcfs.list

5.1.3 Merge VCFs

# Remove MT and Y from list 
sed -i '/MT/d' mikk_genome/data/20200205_vcfs.list

sed -i '/chrY/d' mikk_genome/data/20200205_vcfs.list

# run MergeVCFs 
java -jar /nfs/software/birney/picard-2.9.0/picard.jar MergeVcfs \
  I=mikk_genome/data/20200205_vcfs.list \
  O=vcfs/1gk_all.vcf.gz

5.2 Get LD stats using `Plink`

# make BED
mkdir plink/20200727_mikk_no-missing_maf-0.05

plink \
  --vcf vcfs/panel_no-sibs_line-ids.vcf.gz \
  --make-bed \
  --double-id \
  --snps-only \
  --biallelic-only \
  --maf 0.05 \
  --geno 0 \
  --chr-set 24 no-xy \
  --out plink/20200727_mikk_no-missing_maf-0.05/20200727

# get LD stats for MIKK
mkdir ld/20200727_mikk_maf-0.10_window-50kb_no-missing/

for i in $(seq 1 24); do
  plink \
      --bfile plink/20200727_mikk_no-missing_maf-0.05/20200727 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 50 \
      --ld-window-r2 0 \
      --chr-set 24 no-xy \
      --chr $i \
      --maf 0.10 \
      --out ld/20200727_mikk_maf-0.10_window-50kb_no-missing/$i;
done

# get LD stats for 1KG
mkdir ld/20200727_1kg_maf-0.10_window-50kb_no-missing/

for i in $(seq 1 22); do
  plink \
      --bfile plink/20200723_1gk_no-missing_maf-0.05/20200723 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 50 \
      --ld-window-r2 0 \
      --chr $i \
      --maf 0.10 \
      --out ld/20200727_1kg_maf-0.10_window-50kb_no-missing/$i;
done

# do again with ld-window-kb 10 to get counts of comparisons for paper
# MIKK  
mkdir ld/20200803_mikk_maf-0.10_window-10kb_no-missing/

for i in $(seq 1 24); do
  plink \
      --bfile plink/20200727_mikk_no-missing_maf-0.05/20200727 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 10 \
      --ld-window-r2 0 \
      --chr-set 24 no-xy \
      --chr $i \
      --maf 0.10 \
      --out ld/20200803_mikk_maf-0.10_window-10kb_no-missing/$i;
done

# 1KG
mkdir ld/20200803_1kg_maf-0.10_window-10kb_no-missing/

for i in $(seq 1 22); do
  plink \
      --bfile plink/20200723_1gk_no-missing_maf-0.05/20200723 \
      --r2 \
      --ld-window 999999 \
      --ld-window-kb 10 \
      --ld-window-r2 0 \
      --chr $i \
      --maf 0.10 \
      --out ld/20200803_1kg_maf-0.10_window-10kb_no-missing/$i;
done

# Get total counts of pairwise comparisons:
wc -l ld/20200803_mikk_maf-0.10_window-10kb_no-missing/*.ld
# 204,152,898
wc -l ld/20200803_1kg_maf-0.10_window-10kb_no-missing/*.ld

5.3 Get mean LD within SNP-distance windows

5.3.1 0-10kb distance (main, MIKK v 1KG)

Rscript here: mikk_genome/code/scripts/20200727_r2_decay_mean_10kb-lim.R

5.3.1.1 MIKK

script=mikk_genome/code/scripts/20200727_r2_decay_mean_10kb-lim.R

mkdir ld/20200727_mean_r2_10kb-lim_mikk

for i in $(find ld/20200727_mikk_maf-0.10_window-50kb_no-missing/*.ld); do
  name=$(basename $i | cut -f1 -d".") ;
  out_dir=ld/20200727_mean_r2_10kb-lim_mikk ;
  bsub \
    -M 10000 \
    -o log/20200727_$name\_mean-r2_1kb-max.out \
    -e log/20200727_$name\_mean-r2_1kb-max.err \
    "Rscript --vanilla \
      $script \
      $i \
      $out_dir";
done

5.3.1.2 1KG

mkdir ld/20200727_mean_r2_10kb-lim_1kg

for i in $(find ld/20200727_1kg_maf-0.10_window-50kb_no-missing/*.ld); do
  name=$(basename $i | cut -f1 -d".") ;
  out_dir=ld/20200727_mean_r2_10kb-lim_1kg ;
  bsub \
    -M 30000 \
    -o log/20200727_$name\_mean-r2_10kb-max.out \
    -e log/20200727_$name\_mean-r2_10kb-max.err \
    "Rscript --vanilla \
      $script \
      $i \
      $out_dir";
done

5.3.2 0-1kb distance (inset, MIKK only)

Rscript: mikk_genome/code/scripts/20200803_r2_decay_mean_1gk_1kb-lim.R

mkdir ld/20200803_mean_r2_1kb-lim_mikk

out_dir=ld/20200803_mean_r2_1kb-lim_mikk
script=mikk_genome/code/scripts/20200803_r2_decay_mean_1gk_1kb-lim.R

for i in $(find ld/20200727_mikk_maf-0.10_window-50kb_no-missing/*ld); do
  name=$(basename $i | cut -f1 -d".");
  bsub \
    -M 30000 \
    -o log/20200803_$name\_mean-r2_1kb-max.out \
    -e log/20200803_$name\_mean-r2_1kb-max.err \
    "Rscript --vanilla \
      $script \
      $i \
      $out_dir";
done

5.4 Create LD plots in `R`

5.4.1 Main

5.4.1.1 Read in and process data

# Setup
require(here)
source(here::here("code", "scripts", "ld_decay", "source.R"))

# Create function to read in data and bind into single DF

read_n_bind = function(data_path_pref, dataset){
  # Set path
  path = paste(data_path_pref, dataset, sep = "")
  
  # Read in data
  data_files <- list.files(path,
                           full.names = T)
  data_files_trunc <- list.files(path)
  data_files_trunc <- gsub(".txt", "", data_files_trunc)
  
  data_list <- lapply(data_files, function(data_file){
    df <- read.delim(data_file,
                     sep = "\t",
                     header = T)
    return(df)
  })
  names(data_list) <- as.integer(data_files_trunc)
  
  # reorder
  data_list <- data_list[order(as.integer(names(data_list)))]
  
  # bind into DF
  out_df = dplyr::bind_rows(data_list, .id = "chr")
  out_df$chr <- factor(out_df$chr, levels = seq(1, 24))
  
  # get kb measure
  out_df$bin_bdr_kb <- out_df$bin_bdr / 1000  
  
  return(out_df)
}

# Run over both datasets
datasets = c("mikk", "1kg")
final_lst = lapply(datasets, function(x) read_n_bind("ld/20200727_mean_r2_10kb-lim_", x))
names(final_lst) = datasets

# Combine into single DF
r2_final_df <- dplyr::bind_rows(final_lst, .id = "dataset")

# Write table to repo
write.table(r2_final_df,
            file = here::here("mikk_genome", "data", "20200803_r2_10kb-lim.csv"),
            quote = F, sep = ",", row.names = F, col.names = T)

5.4.1.2 Plot

# Tidy data for final plot
r2_final_df$chr = factor(r2_final_df$chr, levels = seq(1, 24))
r2_final_df$dataset = toupper(r2_final_df$dataset)

# Plot
r2_plot_main = r2_final_df %>% ggplot() +
  geom_line(aes(bin_bdr_kb, mean, colour = chr)) +
  theme_cowplot() +
  xlab("Distance between SNPs (kb)") +
  ylab(bquote(.("Mean r")^2)) +
  facet_wrap(~dataset, nrow = 1, ncol = 2) +
  theme(panel.grid = element_blank(),
        strip.background = element_blank(),
        legend.position = c(0.9, .8)) +
  labs(colour = "Chromosome") +
  scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
                     limits = c(0.05, 0.6))

#r2_plot_main

ggplotly(r2_plot_main)

## Warning in if (robust_nchar(axisTitleText) > 0) {: the condition has length > 1 and only the first element will be used

## Warning in if (robust_nchar(axisTitleText) > 0) {: the condition has length > 1 and only the first element will be used

# Save plot to repo
ggsave(filename = paste("20200803_mean-r2_10kb-lim_1KGvMIKK_single", ".svg", sep = ""),
       plot = r2_plot_main,
       device = "svg",
       path = here::here("plots", "ld_decay"),
       width = 25,
       height = 13,
       units = "cm")

5.4.2 Inset

5.4.2.1 100-bp windows

# Read in data
r2_df_1kb_mikk = read_n_bind("ld/20200803_mean_r2_1kb-lim_", "mikk")

# Write table to repo
write.table(r2_df_1kb_mikk,
            file = here::here("mikk_genome", "data", "20200803_r2_1kb-lim_mikk.csv"),
            quote = F, sep = ",", row.names = F, col.names = T)

# Process for plotting
r2_df_1kb_mikk$chr <- factor(r2_df_1kb_mikk$chr, levels = seq(1, 24))

# Plot
r2_1kb_mikk = r2_df_1kb_mikk %>% ggplot() +
  geom_line(aes(bin_bdr, mean, colour = chr)) +
  theme_bw() +
  xlab("Distance beetween SNPs (bp)") +
  ylab(bquote(.("Mean r")^2)) +
  labs(colour = "Chromosome") +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14)) +
  guides(colour = F) +
  scale_x_continuous(limits = c(0, 1000)) +
  scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
                     limits = c(0.05, 0.6))

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

r2_1kb_mikk

# Save to repo
ggsave(filename = paste("20200803_mean-r2_1kb-lim_MIKK_inset_100bp-bins", ".png", sep = ""),
       plot = r2_1kb_mikk,
       device = "png",
       path = here::here("mikk_genome", "plots"),
       width = 10.88,
       height = 8,
       units = "cm",
       dpi = 500)

5.4.2.2 10-bp windows

For a finer resolution.

5.4.2.2.1 Get means for each bin

script=mikk_genome/code/scripts/20200724_r2_decay_mean_1gk_1kb-lim.R
out_dir=ld/20200727_mean_r2_1kb-lim_mikk

for in_file in $(find ld/20200727_mikk_maf-0.10_window-50kb_no-missing/*ld); do
  name=$(basename $in_file | cut -f1 -d".");
  bsub \
    -M 30000 \
    -o log/20200803_$name\_mean-r2_1kb-max.out \
    -e log/20200803_$name\_mean-r2_1kb-max.err \
    "Rscript \
      --vanilla \
      $script \
      $in_file \
      $out_dir";
done

# Combine in R
data_files <- list.files("ld/20200727_mean_r2_1kb-lim_mikk",
                         full.names = T)

data_files_trunc <- list.files("ld/20200727_mean_r2_1kb-lim_mikk")

data_files_trunc <- gsub(".txt", "", data_files_trunc)

data_list <- lapply(data_files, function(data_file){
  df <- read.delim(data_file,
                   sep = "\t",
                   header = T)
  return(df)
})

names(data_list) <- as.integer(data_files_trunc)

# reorder
data_list <- data_list[order(as.integer(names(data_list)))]

# bind into DF
r2_df_1kb_mikk <- dplyr::bind_rows(data_list, .id = "chr")
r2_df_1kb_mikk$chr <- factor(r2_df_1kb_mikk$chr, levels = seq(1, 24))

# write to table
write.table(r2_df_1kb_mikk, here::here("mikk_genome", "data", "20200803_mikk_ld-decay_1kb-lim_10bp-windows.txt"),
            quote = F, row.names = F, col.names = T, sep = "\t")

5.4.2.2.2 Plot

# Read in data
r2_df_1kb_mikk = read.table(here::here("data", "20200803_mikk_ld-decay_1kb-lim_10bp-windows.txt"),
                            header = T, sep = "\t", as.is = T)


# Factorise chromosomes
r2_df_1kb_mikk$chr <- factor(r2_df_1kb_mikk$chr, levels = seq(1, 24))

# Plot
r2_df_1kb_mikk %>% ggplot() +
  geom_line(aes(bin_bdr, mean, colour = chr)) +
  theme_bw() +
  xlab("Distance beetween SNPs (bp)") +
  ylab(bquote(.("Mean r")^2)) +
  labs(colour = "Chromosome") +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 16)) +
  guides(colour = F) +
  scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7),
                     limits = c(0.05, 0.7))

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

## Warning: Removed 3 row(s) containing missing values (geom_path).

# Save
ggsave(filename = paste("20200803_mean-r2_1kb-lim_MIKK_inset_10bp-windows", ".png", sep = ""),
       device = "png",
       path = here::here("mikk_genome", "plots"),
       width = 10.88,
       height = 8,
       units = "cm",
       dpi = 500)

6 MAF distribution MIKK v 1KG

6.1 Get frequencies with `plink`

# 1KG
plink \
  --bfile plink/20200723_1gk_no-missing/20200723 \
  --freq \
  --out maf/20200727_1kg_no-missing
# Creates a 3.9GB file.  
  
# MIKK
plink \
  --bfile plink/20200716_panel_no-sibs_line-ids_no-missing/20200716 \
  --freq \
  --chr-set 24 no-xy \
  --out maf/20200727_mikk_no-missing
# Creates a 657MB file.

6.2 Plot

in_mikk <- "../maf/20200727_mikk_no-missing.frq"
in_1kg <- "../maf/20200727_1kg_no-missing.frq"
#out_file <- args[3]

## MIKK
maf_mikk <- readr::read_delim(in_mikk,
                             delim = " ",
                             trim_ws = T,
                             col_types = cols_only(MAF = col_double()))
maf_mikk$dataset <- "MIKK panel"

## 1KG
maf_1kg <- readr::read_delim(in_1kg,
                             delim = " ",
                             trim_ws = T,
                             col_types = cols_only(MAF = col_double()))
maf_1kg$dataset <- "1000 Genomes"

## Bind
maf_final <- rbind(maf_mikk, maf_1kg)

# Plot
maf_plot = maf_final %>%
  ggplot() +
    geom_histogram(aes(x = MAF,
                       y=0.01*..density..,
                       fill = dataset),
                   binwidth = 0.01) +
    theme_cowplot() +
    guides(fill = F) +
    facet_wrap(~dataset, nrow = 1, ncol = 2) +
    xlab("Minor allele frequencies") +
    ylab("Density") +
    theme(strip.background = element_blank(),
          strip.text = element_text(size = 14,
                                    face = "bold"))

6.2.1 LD decay without labels

r2_plot_main_nolabs = r2_final_df %>% ggplot() +
  geom_line(aes(bin_bdr_kb, mean, colour = chr)) +
  theme_cowplot() +
  xlab("Distance between SNPs (kb)") +
  ylab(bquote(.("Mean r")^2)) +
  facet_wrap(~dataset, nrow = 1, ncol = 2) +
  theme(panel.grid = element_blank(),
        strip.background = element_blank(),
        strip.text.x = element_blank(),
        legend.position = c(.9, .8),
        legend.key.size = unit(9, "points"),
        legend.title = element_text(size = 9),
        legend.text = element_text(size = 9)) +
  labs(colour = "Chromosome") +
  scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
                     limits = c(0.05, 0.6))

6.3 Combine with LD decay for final figure

final_fig = cowplot::ggdraw() +
  draw_plot(maf_plot,
            x = 0, y = .7, width = 1, height = .3) +
  draw_plot(r2_plot_main_nolabs,
            x = 0, y = 0, width = 1, height = .7) +
  draw_plot_label(label = c("A", "B"), size = 15,
                  x = c(0, 0), y = c(1, .7))

out_path = here::here("docs/plots/ld_decay/20210305_final_figure.png")

ggsave(out_path,
       plot = final_fig,
       device = "png",
       width = 23,
       height = 22,
       units = "cm",
       dpi = 500)

knitr::include_graphics(here::here("docs/plots/ld_decay/20210305_final_figure.png"))

7 Investigation of LD decay in chr 2

Chromosome 2 has an obviously faster LD decay than the other chromosomes. We explore some possible reasons for this.

7.1 Get lengths of each chr on `bash`

seq 1 24 > tmp1.txt

grep ">" refs/Oryzias_latipes.ASM223467v1.dna.toplevel.fa | scut -f6 -d":" | head -24 > tmp2.txt

paste tmp1.txt tmp2.txt > mikk_genome/data/Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt

7.2 Get proportion of each chromosome covered by exons using `biomaRt`

# Load libraries
library(here)
source(here::here("code", "scripts", "ld_decay", "source.R"))

# Get length of chromosomes
chr_counts <- readr::read_tsv(here::here("data",
                                         "Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
                              col_names = c("chr", "length")) %>% 
  dplyr::filter(chr != "MT") %>% 
  dplyr::mutate(chr = as.integer(chr))

# Select database and list datasets within
ensembl_mart <- useMart("ENSEMBL_MART_ENSEMBL")

# Select dataset
ensembl_olat <- useDataset("olatipes_gene_ensembl", mart = ensembl_mart)
olat_mart = useEnsembl(biomart = "ensembl", dataset = "olatipes_gene_ensembl")
# Get attributes of interest (exon ID, chr, start, end)
exons <- getBM(attributes = c("chromosome_name",
                              "ensembl_gene_id", 
                              "ensembl_transcript_id", 
                              "transcript_start", 
                              "transcript_end", 
                              "transcript_length", 
                              "ensembl_exon_id", 
                              "rank", 
                              "strand", 
                              "exon_chrom_start", 
                              "exon_chrom_end", 
                              "cds_start", 
                              "cds_end"),
               mart = olat_mart)

# Factorise chr so it's in the right order
chrs <- unique(exons$chromosome_name)
auto_range <- range(as.integer(chrs), na.rm = T)
non_auto <- chrs[is.na(as.integer(chrs))]
chr_order <- c(seq(auto_range[1], auto_range[2]), non_auto)
exons$chromosome_name <- factor(exons$chromosome_name, levels = chr_order)

# Convert into list
exons_lst <- split(exons, f = exons$chromosome_name)

# Get mean length of exons per chromosome
exons_lst <- lapply(exons_lst, function(chr){
  chr <- chr %>%
    dplyr::mutate(exon_length = (exon_chrom_end - exon_chrom_start) + 1,
                  transcript_total_length = (transcript_end - transcript_start) + 1)
  return(chr)
})

# Get total length of chr covered by exons
exon_lengths <- lapply(exons_lst, function(chr){
  # create list of start pos to end pos sequences for each exon
  out_list <- apply(chr, 1, function(exon) {
    seq(exon[["exon_chrom_start"]], exon[["exon_chrom_end"]])
  })
  # combine list of vectors into single vector and get only unique numbers
  out_vec <- unique(unlist(out_list))
  # get length of out_vec and put it into data frame
  out_final <- data.frame("exon_cov" = length(out_vec))
  return(out_final)
})

# combine into single DF
exons_len_df <- dplyr::bind_rows(exon_lengths, .id = "chr") %>% 
  dplyr::mutate(chr = as.integer(chr))

# join with chr_counts and get proportion of chr covered by exons
chr_stats <- dplyr::left_join(chr_counts, exons_len_df, by = "chr") %>% 
  dplyr::mutate(prop_cov_exon = exon_cov / length)
# convert chr to factor for plotting
chr_stats$chr <- factor(chr_stats$chr)

7.3 Get SNP counts per megabase

7.3.1 Get counts

bcftools index \
  --stats \
  ../vcfs/panel_no-sibs_line-ids_no-missing_bi-snps_with-af.vcf.gz \
    > data/20201106_non-missing_bi-snp_count.txt

7.3.2 Read SNP counts data into `R`

snp_counts = read.table(here::here("data", "20201106_non-missing_bi-snp_count.txt"),
                        sep = "\t",
                        col.names = c("chr", "length", "snp_count")) %>% 
  # create megabase column
  dplyr::mutate(megabases = length / 1e6,
                snps_per_megabase = snp_count / megabases) %>% 
  # remove MT
  dplyr::filter(chr != "MT") %>% 
  # turn chr column into integer
  dplyr::mutate(chr = as.factor(as.integer(chr)))

7.4 Combine SNP counts with exon proportion counts

chr_df = snp_counts %>% 
  dplyr::full_join(chr_stats, by = c("chr", "length"))

# Create recode vector
recode_vec = c("Non-missing, biallelic SNPs per megabase",
               "Proportion of chromosome covered by exons")
names(recode_vec) = c("snps_per_megabase",
                      "prop_cov_exon")

7.5 Plot

chr_df %>% 
  tidyr::pivot_longer(cols = c(snps_per_megabase, prop_cov_exon), 
                      names_to = "variable",
                      values_to = "values") %>% 
  dplyr::mutate(variable = dplyr::recode(variable, !!!recode_vec)) %>% 
  ggplot() +
    geom_col(aes(chr, values, fill = chr)) +
    guides(fill = F) + 
    xlab("Chromosome") +
    ylab(NULL) +
    theme_bw() +
    facet_wrap(~variable,
               nrow = 2, ncol = 1,
               scales = "free_y")

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

chr_df %>% 
  ggplot(aes(snps_per_megabase, prop_cov_exon, colour = chr, label = chr)) +
  geom_point() +
  geom_text(hjust = -0.5) +
  theme_bw() +
  guides(colour = F) +
  xlab("Non-missing, biallelic SNPs per megabase") +
  ylab("Proportion of chromosome covered by exons")

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

SNPs per Mb vs proportion of chr covered by exons

# Save to repo
ggsave(filename = paste("20201106_snps-per-mb_v_exon-props", ".png", sep = ""),
       device = "png",
       path = here("mikk_genome", "plots"),
       width = 24,
       height = 20,
       units = "cm",
       dpi = 500)

7.6 Calculate correlation

cor.test(chr_df$snps_per_megabase, chr_df$prop_cov_exon, method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  chr_df$snps_per_megabase and chr_df$prop_cov_exon
## S = 3274, p-value = 0.04033
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.4234783

8 Calculate linkage distance in chr2 relative to other chromosomes

From Naruse et al., Genetics, 2000 (https://www.genetics.org/content/154/4/1773) Table 4:

cms = c(44.2, 85.4, 84.3, 73.5, 66, 82.4, 39, 57.4, 54.3, 71.5, 39.4, 36.6, 48.9, 68.1, 58.1, 57.9, 74.7, 59.8, 47.4, 71.8, 32.3, 29.7, 47.4, 24.4)
names(cms) = 1:24

# cM/Mb
cms_per_mb = cms / (chroms$end / 1e6)
cms_per_mb

##        1        2        3        4        5        6        7        8        9       10       11       12       13 
## 1.172005 3.364978 2.203999 2.236159 1.987647 2.555297 1.128035 2.187554 1.625777 2.290307 1.396642 1.198292 1.445643 
##       14       15       16       17       18       19       20       21       22       23       24 
## 2.225564 1.906416 1.756745 2.349631 1.934099 1.860803 2.767696 1.036958 1.024964 1.942559 1.030304

# cM/Mb for chr 2
cms_per_mb[[2]]

## [1] 3.364978

# cM/Mb for the rest
mean(cms_per_mb[-2])

## [1] 1.794048

# range for the rest
range(cms_per_mb[-2])

## [1] 1.024964 2.767696

Linkage disequilibrium

2022-02-09