library(here)
library(tidyverse)
library(karyoploteR)
library(plotly)
library(circlize)
library(GGally)
library(viridis)https://github.com/brettellebi/mikk_genome/tree/master/code/snakemake/nucleotide_diversity
Steps
VCFtools’s --window-pi to calculate nucleotide divergence in different-sized windows.bcftools to extract MQ from INFO field.Data location: /nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence
# Set location of data
in_dir = "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/all"
in_files = list.files(in_dir, full.names = T)
dat_list = lapply(in_files, function(IN_FILE) {
# Read in
readr::read_delim(IN_FILE,
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
})
names(dat_list) = basename(in_files) %>%
str_remove(".windowed.pi")# Set location of data
in_dir = "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/wild_kiyosu"
in_files = list.files(in_dir, full.names = T)
wk_list = lapply(in_files, function(IN_FILE) {
# Read in
readr::read_delim(IN_FILE,
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
})
names(wk_list) = basename(in_files) %>%
str_remove(".windowed.pi")med_chr_lens = read.table(here::here("data",
"Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
col.names = c("chr", "end"))
# Add start
med_chr_lens$start = 1
# Reorder
med_chr_lens = med_chr_lens %>%
dplyr::select(chr, start, end) %>%
# remove MT
dplyr::filter(chr != "MT") %>%
# convert to numeric
dplyr::mutate(chr = as.integer(chr)) %>%
# order
dplyr::arrange(chr)
# Create custom genome
med_genome = regioneR::toGRanges(med_chr_lens)# Test with ggplot
dat_list$`1000000` %>%
ggplot() +
geom_line(aes(BIN_MID, PI)) +
facet_grid(cols = vars(CHROM))
dat_list$`500000` %>%
ggplot() +
geom_line(aes(BIN_MID, PI)) +
facet_grid(cols = vars(CHROM))
dat_list$`100000` %>%
ggplot() +
geom_line(aes(BIN_MID, PI)) +
facet_grid(cols = vars(CHROM))
## Get max Y
round.choose <- function(x, roundTo, dir = 1) {
if(dir == 1) { ##ROUND UP
x + (roundTo - x %% roundTo)
} else {
if(dir == 0) { ##ROUND DOWN
x - (x %% roundTo)
}
}
}
y_max = round.choose(max(dat_list$`1000000`$PI), 0.01, 1)# set file name
file_name = paste("20210930_", "1Mb", ".png", sep = "")
file_out = here::here("docs/plots/nucleotide_diversity", file_name)
png(file=file_out,
width=13000,
height=1000,
units = "px",
res = 300)
# Plot ideogram
kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
# Add base numbers
karyoploteR::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
# Set y-axis limits
karyoploteR::kpAxis(kp, ymin=0, ymax=y_max )
# Add lines
lwd = 1
karyoploteR::kpLines(kp,
chr = dat_list$`1000000`$CHROM,
x = dat_list$`1000000`$BIN_MID,
y = dat_list$`1000000`$PI,
ymax = y_max,
r0=0, r1 = 1,
lwd = lwd)
dev.off() file_out = here::here("docs/plots/nucleotide_diversity/20210930_1Mb.png")
knitr::include_graphics(file_out)
repeats_file = "/nfs/research/birney/users/ian/mikk_genome/repeats/medaka_hdrr_repeats.fixed.gff"
hdrr_reps = readr::read_delim(repeats_file,
delim = "\t",
col_names = F,
skip = 3,
comment = "",
quote = "") %>%
# Remove empty V8 column
dplyr::select(-X8) %>%
# Get class of repeat from third column
dplyr::mutate(class = stringr::str_split(X3, pattern = "#", simplify = T)[, 1]) %>%
# Rename columns
dplyr::rename(chr = X1, tool = X2, class_full = X3, start = X4, end = X5, percent = X6, strand = X7, info = X9)
# Find types of class other than "(GATCCA)n" types
class_types = unique(hdrr_reps$class[grep(")n", hdrr_reps$class, invert = T)])
hdrr_reps = hdrr_reps %>%
# NA for blanks
dplyr::mutate(class = dplyr::na_if(class, "")) %>%
# "misc" for others in "(GATCCA)n" type classes
dplyr::mutate(class = dplyr::if_else(!class %in% class_types, "Miscellaneous", class)) %>%
# rename "Simple_repeat"
dplyr::mutate(class = dplyr::recode(class, "Simple_repeat" = "Simple repeat")) %>%
# filter out NA in `chr` (formerly MT)
dplyr::filter(!is.na(chr))# Create GRanges object with bins
bin_length = "1000000"
bin_intervals = dat_list[[bin_length]] %>%
dplyr::select(CHROM, BIN_START, BIN_END) %>%
# split into list by chromosome
split(., f = .$CHROM)
# Replace END pos of final bin for each chromsome with actual end pos
bin_intervals = lapply(1:length(bin_intervals), function(CHR){
# Get end position for target chromosome
end_pos = med_chr_lens %>%
dplyr::filter(chr == CHR) %>%
dplyr::pull(end)
# Replace final bin end position
out = bin_intervals[[CHR]]
out$BIN_END[nrow(out)] = end_pos
return(out)
}) %>%
dplyr::bind_rows()
# Convert to data frame
bin_intervals = as.data.frame(bin_intervals)
# Convert to GRanges
bin_ranges = regioneR::toGRanges(bin_intervals)# Convert `hdrr_reps` to GRanges
hdrr_ranges = GenomicRanges::makeGRangesFromDataFrame(hdrr_reps,
keep.extra.columns = T,
seqnames.field = "chr",
start.field = "start",
end.field = "end")
# Get non-overlapping regions
hdrr_covered = GenomicRanges::disjoin(hdrr_ranges)overlaps = GenomicRanges::findOverlaps(bin_ranges, hdrr_covered)
# Split into list by bin
overlaps_list = lapply(unique(overlaps@from), function(BIN){
out = list()
# Get indexes of all repeat ranges overlapping the target BIN
out[["hits"]] = overlaps[overlaps@from == BIN]@to
# Extract those ranges from `hdrr_ranges`
out[["range_hits"]] = hdrr_covered[out[["hits"]]]
# Get number bases covered by each range
out[["widths"]] = out[["range_hits"]] %>%
GenomicRanges::width(.)
# Get summed widths
out[["summed"]] = out[["widths"]] %>%
# Get total bases covered in bin
sum(.)
return(out)
})
overlaps_vec = purrr::map(overlaps_list, function(BIN){
BIN[["summed"]]
}) %>%
unlist(.)
# Add as column to `bin_intervals`
bin_intervals$REPEAT_COV = overlaps_vec
# Caclulate proportion
bin_intervals = bin_intervals %>%
dplyr::mutate(REPEAT_PROP = REPEAT_COV / (BIN_END - BIN_START + 1))in_file = "/nfs/research/birney/users/ian/mikk_genome/mapping_quality/mapping_quality.csv"
mq_df = readr::read_csv(in_file,
col_names = c("CHROM", "POS", "MQ"),
col_types = c("cid")) %>%
# remove 'MT'
dplyr::filter(!CHROM == "MT") %>%
# make CHROM integer
dplyr::mutate(CHROM = as.integer(CHROM))
# Bin and get means
mq_list = mq_df %>%
split(., f = .$CHROM)
# Bin into 1 Mb intervals
binned_mq_list = purrr::map(mq_list, function(CHR){
# Set intervals
intervals = seq(1, max(CHR$POS), by = 1000000)
# add final length
if (max(intervals) != max(CHR$POS)) {
intervals = c(intervals, max(CHR$POS))
}
# bin
CHR = CHR %>%
dplyr::mutate(BIN = cut(POS,
breaks = intervals,
labels = F,
include.lowest = T))
})
# calculate mean MQ within each bin
mean_mq = purrr::map(binned_mq_list, function(CHR){
CHR %>%
# replace Inf with NA
dplyr::mutate(MQ = dplyr::na_if(MQ, Inf)) %>%
# Group by BIN
dplyr::group_by(BIN) %>%
# Calculate mean MQ within each bin
dplyr::summarise(MEAN_MQ = mean(MQ, na.rm = T))
}) %>%
# bind into single data frame
dplyr::bind_rows(.id = "CHR")## Warning: One or more parsing issues, see `problems()` for details
# Bind data frames
pi_repeat_df = cbind(dat_list$`1000000`,
PI_WK = wk_list[["1000000"]]$PI,
REPEAT_PROP = bin_intervals$REPEAT_PROP,
MEAN_MQ = mean_mq$MEAN_MQ)pairs_plot = GGally::ggpairs(pi_repeat_df, columns = c(5, 7, 8, 9),
lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5),
combo = wrap("dot_no_facet", alpha = 0.4)))
plotly::ggplotly(pairs_plot)## Warning: Can only have one: highlight
## Warning: Can only have one: highlight
## Warning: Can only have one: highlight
df_circos_1Mb = pi_repeat_df
# take `BIN_END` from `bin_intervals` with chromosome end number for final bin
df_circos_1Mb$BIN_END = bin_intervals$BIN_ENDout_plot = here::here("docs/plots/nucleotide_diversity", "20211001_circos_1Mb.png")png(out_plot,
width = 20,
height = 20,
units = "cm",
res = 500)
# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
track.margin = c(0, 0),
gap.degree = c(rep(1, nrow(med_chr_lens) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(med_chr_lens,
plotType = c("axis", "labels"),
major.by = 1e7,
axis.labels.cex = 0.25*par("cex"))
# Add MIKK Pi line
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
value[[2]],
col = "#49A379",
area = T,
border = karyoploteR::darker("#49A379"))
},
track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.021))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("MIKK (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '2'.
# Add wild Kiyosu line
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
value[[4]],
col = "#7A306C",
area = T,
border = karyoploteR::darker("#8A4F7D"))
},
track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.021))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("Wild\nKiyosu (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '3'.
# Add repeat content
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
value[[5]],
type = "h",
col = "#0C1B33",
cex = 0.05,
baseline = 0)
},
track.height = 0.1,
ylim = c(0,0.5),
bg.border = NA)
circos.yaxis(side = "right",
at = c(0, .5, 1),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 0.25,
labels = "Repeat\ncontent",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '4'.
# Add mean mapping quality
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
value[[6]],
type = "h",
col = "#FF6978",
cex = 0.05,
baseline = 20)
},
track.height = 0.1,
ylim = c(20,65),
bg.border = NA)
circos.yaxis(side = "right",
at = c(20, 50),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 40,
labels = "Mean MQ",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '5'.
# Add baseline
#circos.xaxis(h = "bottom",
# labels = F,
# major.tick = F)
# Print label in center
text(0, 0, "Nucleotide diversity in\nMIKK and wild Kiyosu medaka,\nrepeat content,\nand mapping quality\nin 1 Mb windows")
circos.clear()
dev.off()## png
## 2
knitr::include_graphics(out_plot)
# Create GRanges object with bins
bin_length = "500000"
bin_intervals_500 = dat_list[[bin_length]] %>%
dplyr::select(CHROM, BIN_START, BIN_END) %>%
# split into list by chromosome
split(., f = .$CHROM)
# Replace END pos of final bin for each chromsome with actual end pos
bin_intervals_500 = lapply(1:length(bin_intervals_500), function(CHR){
# Get end position for target chromosome
end_pos = med_chr_lens %>%
dplyr::filter(chr == CHR) %>%
dplyr::pull(end)
# Replace final bin end position
out = bin_intervals_500[[CHR]]
out$BIN_END[nrow(out)] = end_pos
return(out)
}) %>%
dplyr::bind_rows()
# Convert to data frame
bin_intervals_500 = as.data.frame(bin_intervals_500)
# Convert to GRanges
bin_ranges = regioneR::toGRanges(bin_intervals_500)
overlaps = GenomicRanges::findOverlaps(bin_ranges, hdrr_covered)
# Split into list by bin
overlaps_list = lapply(unique(overlaps@from), function(BIN){
out = list()
# Get indexes of all repeat ranges overlapping the target BIN
out[["hits"]] = overlaps[overlaps@from == BIN]@to
# Extract those ranges from `hdrr_ranges`
out[["range_hits"]] = hdrr_covered[out[["hits"]]]
# Get number bases covered by each range
out[["widths"]] = out[["range_hits"]] %>%
GenomicRanges::width(.)
# Get summed widths
out[["summed"]] = out[["widths"]] %>%
# Get total bases covered in bin
sum(.)
return(out)
})
overlaps_vec = purrr::map(overlaps_list, function(BIN){
BIN[["summed"]]
}) %>%
unlist(.)
# Add as column to `bin_intervals_500`
bin_intervals_500$REPEAT_COV = overlaps_vec
# Caclulate proportion
bin_intervals_500 = bin_intervals_500 %>%
dplyr::mutate(REPEAT_PROP = REPEAT_COV / (BIN_END - BIN_START + 1))# Bin into 1 Mb intervals
binned_mq_list_500 = purrr::map(mq_list, function(CHR){
# Set intervals
intervals = seq(1, max(CHR$POS), by = 500000)
# add final length
if (max(intervals) != max(CHR$POS)) {
intervals = c(intervals, max(CHR$POS))
}
# bin
CHR = CHR %>%
dplyr::mutate(BIN = cut(POS,
breaks = intervals,
labels = F,
include.lowest = T))
})
# calculate mean MQ within each bin
mean_mq_500 = purrr::map(binned_mq_list_500, function(CHR){
CHR %>%
# replace Inf with NA
dplyr::mutate(MQ = dplyr::na_if(MQ, Inf)) %>%
# Group by BIN
dplyr::group_by(BIN) %>%
# Calculate mean MQ within each bin
dplyr::summarise(MEAN_MQ = mean(MQ, na.rm = T))
}) %>%
# bind into single data frame
dplyr::bind_rows(.id = "CHR")df_circos_500kb = cbind(dat_list$`500000`,
PI_WK = wk_list[["500000"]]$PI,
REPEAT_PROP = bin_intervals_500$REPEAT_PROP,
MEAN_MQ = mean_mq_500$MEAN_MQ)
# Add bin ends at the end of each chromosome to match real chromosome ends
df_circos_500kb$BIN_END = bin_intervals_500$BIN_ENDout_plot = here::here("docs/plots/nucleotide_diversity/20211001_circos_500kb.png")png(out_plot,
width = 20,
height = 20,
units = "cm",
res = 500)
# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
track.margin = c(0, 0),
gap.degree = c(rep(1, nrow(med_chr_lens) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(med_chr_lens,
plotType = c("axis", "labels"),
major.by = 1e7,
axis.labels.cex = 0.25*par("cex"))
# Add MIKK Pi line
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
value[[2]],
col = "#49A379",
area = T,
border = karyoploteR::darker("#49A379"))
},
track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.023))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("MIKK (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '2'.
# Add wild Kiyosu line
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
value[[4]],
col = "#7A306C",
area = T,
border = karyoploteR::darker("#8A4F7D"))
},
track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.023))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("Wild\nKiyosu (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '3'.
# Add repeat content
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
value[[5]],
type = "h",
col = "#0C1B33",
lwd = 0.4,
cex = 0.05,
baseline = 0)
},
track.height = 0.1,
ylim = c(0,0.65),
bg.border = NA)
circos.yaxis(side = "right",
at = c(0, .5, 1),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 0.325,
labels = "Repeat\ncontent",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '4'.
# Add mean mapping quality
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
value[[6]],
type = "h",
col = "#FF6978",
lwd = 0.4,
cex = 0.05,
baseline = 20)
},
track.height = 0.07,
ylim = c(20,65),
bg.border = NA)
circos.yaxis(side = "right",
at = c(20, 50),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 40,
labels = "Mean MQ",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex")) ## Note: 1 point is out of plotting region in sector '1', track '5'.
# Print label in center
text(0, 0, "Nucleotide diversity in\nMIKK and wild Kiyosu medaka,\nwith repeat content\nand mapping quality\nin 500 kb windows")
circos.clear()
dev.off()## png
## 2
knitr::include_graphics(out_plot)
# MIKK
mean(df_circos_500kb$PI)## [1] 0.003829434
median(df_circos_500kb$PI)## [1] 0.00329243
# Wild median Pi
mean(df_circos_500kb$PI_WK)## [1] 0.003685076
median(df_circos_500kb$PI_WK)## [1] 0.003102805
# Read in data
mikk_7 = readr::read_delim("/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/random/500.windowed.pi",
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
# Mean
mean(mikk_7$PI)## [1] 0.003748531
median(mikk_7$PI)## [1] 0.003197675
# Set location of data
in_dir = "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/per_sample"
in_files = list.files(in_dir, full.names = T)
mikk_list = lapply(in_files, function(IN_FILE) {
# Read in
readr::read_delim(IN_FILE,
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
})
names(mikk_list) = basename(in_files) %>%
str_remove(".windowed.pi")
# reorder
ordered_lines = tibble::tibble("LINE" = names(mikk_list)) %>%
tidyr::separate(col = LINE, into = c("LINE", "SIB", "EXTRA"), sep = "_") %>%
dplyr::mutate(LINE = as.integer(LINE)) %>%
dplyr::arrange(LINE) %>%
tidyr::unite(col = "ORIGINAL", sep = "_", na.rm = T) %>%
dplyr::pull()## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 62 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16,
## 17, 18, 19, 20, 21, ...].
mikk_list = mikk_list[order(match(names(mikk_list), ordered_lines))]
# Bind rows
mikk_df = mikk_list %>%
dplyr::bind_rows(.id = "LINE") %>%
# factor LINE to order
dplyr::mutate(LINE = factor(LINE, levels = names(mikk_list)))# Set location of data
in_dir = "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/wild/per_sample"
in_files = list.files(in_dir, full.names = T)
wild_list = lapply(in_files, function(IN_FILE) {
# Read in
readr::read_delim(IN_FILE,
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
})
names(wild_list) = basename(in_files) %>%
str_remove(".windowed.pi")
# Bind rows
wild_df = wild_list %>%
dplyr::bind_rows(.id = "LINE")pi_df = list("MIKK" = mikk_df,
"WILD" = wild_df) %>%
dplyr::bind_rows(.id = "POPULATION")# Create palette
pal = c("#49A379", "#8A4F7D")
names(pal) = c("MIKK", "WILD")
mikk_wild_boxplot_no_filter = pi_df %>%
ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.1) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50)")
mikk_wild_boxplot_no_filter
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_mikk_wild_boxplot_no_filter.png"),
plot = mikk_wild_boxplot_no_filter,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)purrr::map(1:24, function(CHR){
# Create plot
plot_df = mikk_df %>%
# Filter for target CHROM
dplyr::filter(CHROM == CHR) %>%
# Create Mb column
dplyr::mutate(Mb = (BIN_START - 1) / 1e6)
out_plot = plot_df %>%
ggplot() +
geom_col(aes(Mb, PI, fill = PI)) +
facet_grid(rows = vars(LINE), cols = vars(CHROM)) +
theme_bw() +
theme(text = element_text(size = 5)) +
ylim(c(0, max(mikk_df$PI))) +
xlab("500 kb window start position (Mb)") +
ylab(expression(paste("Nucleotide diversity (", pi, ")", sep = ""))) +
ggtitle(paste("Chromosome ", CHR, sep = "")) +
scale_fill_viridis(name = expression(pi), limits = c(0,max(mikk_df$PI)))
# Adjust width dimensions of plot based on length of chromosome
w_dim = mikk_df %>%
dplyr::filter(CHROM == CHR) %>%
dplyr::mutate(Mb = (BIN_START - 1) / 1e6) %>%
dplyr::pull(Mb) %>%
max(.)
w_dim = w_dim * 0.26
# Save
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr", paste(CHR, ".png", sep = "")),
plot = out_plot,
device = "png",
width = w_dim,
height = 20,
units = "in",
dpi = 400)
})knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/1.png"))
knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/2.png"))
knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/22.png"))
target_chrs = c(1, 22)
# Create plot
final_plot_df = mikk_df %>%
# Filter for target CHROM
dplyr::filter(CHROM %in% target_chrs) %>%
# Create Mb column
dplyr::mutate(Mb = (BIN_START - 1) / 1e6)
out_plot = final_plot_df %>%
ggplot() +
geom_col(aes(Mb, PI, fill = PI)) +
facet_grid(rows = vars(LINE), cols = vars(CHROM)) +
theme_bw() +
theme(text = element_text(size = 5)) +
ylim(c(0, max(mikk_df$PI))) +
xlab("500 kb window start position (Mb)") +
ylab(expression(paste("Nucleotide diversity (", pi, ")", sep = ""))) +
scale_fill_viridis(name = expression(pi), limits = c(0,max(mikk_df$PI)))
out_plot
# Save
ggsave(here::here("docs/plots/nucleotide_diversity/20211013_chrs_1_22.png"),
plot = out_plot,
device = "png",
width = 15,
height = 20,
units = "in",
dpi = 400)knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/1.png"))
knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/2.png"))
knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/22.png"))
qc_file = "/hps/nobackup/birney/users/ian/mikk_genome/qc_stats/10_1.csv"
qc_df = readr::read_csv(qc_file,
col_names = c("CHROM", "POS", "DP", "MQ", "QD", "GQ"),
col_types = c("ciiddi")) %>%
# Filter for non-missing GQ
dplyr::filter(complete.cases(GQ)) %>%
# Filter out MT
dplyr::filter(CHROM != "MT") %>%
# Re-class CHROM
dplyr::mutate(CHROM = factor(CHROM, levels = 1:24))## Warning: One or more parsing issues, see `problems()` for details
qc_pairs = qc_df %>%
# take sample to prevent over-plotting
dplyr::slice_sample(n = 1e5) %>%
# create log10 for `DP`
dplyr::mutate(LOG_10_DP = log10(DP)) %>%
GGally::ggpairs(.,
columns = c(7, 4, 5, 6),
lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5),
combo = wrap("dot_no_facet", alpha = 0.4)))
qc_pairs## plot: [1,1] [=====>----------------------------------------------------------------------------------------] 6% est: 0s
## plot: [1,2] [===========>----------------------------------------------------------------------------------] 12% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removing 1 row that contained a missing value
## plot: [1,3] [=================>----------------------------------------------------------------------------] 19% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removed 2 rows containing missing values
## plot: [1,4] [=======================>----------------------------------------------------------------------] 25% est: 3s
## plot: [2,1] [============================>-----------------------------------------------------------------] 31% est: 2s
## Warning: Removed 1 rows containing missing values (geom_point).
## plot: [2,2] [==================================>-----------------------------------------------------------] 38% est: 2s
## Warning: Removed 1 rows containing non-finite values (stat_density).
## plot: [2,3] [========================================>-----------------------------------------------------] 44% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removed 2 rows containing missing values
## plot: [2,4] [==============================================>-----------------------------------------------] 50% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removing 1 row that contained a missing value
## plot: [3,1] [====================================================>-----------------------------------------] 56% est: 1s
## Warning: Removed 2 rows containing missing values (geom_point).
## plot: [3,2] [==========================================================>-----------------------------------] 62% est: 1s
## Warning: Removed 2 rows containing missing values (geom_point).
## plot: [3,3] [================================================================>-----------------------------] 69% est: 1s
## Warning: Removed 2 rows containing non-finite values (stat_density).
## plot: [3,4] [=====================================================================>------------------------] 75% est: 1s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removed 2 rows containing missing values
## plot: [4,1] [===========================================================================>------------------] 81% est: 1s
## plot: [4,2] [=================================================================================>------------] 88% est: 0s
## Warning: Removed 1 rows containing missing values (geom_point).
## plot: [4,3] [=======================================================================================>------] 94% est: 0s
## Warning: Removed 2 rows containing missing values (geom_point).
## plot: [4,4] [==============================================================================================]100% est: 0s

ggsave(here::here("docs/plots/nucleotide_diversity/20211006_qc_pairs.png"),
plot = qc_pairs,
device = "png",
width = 10,
height = 6,
units = "in",
dpi = 400)How many calls have DP < x?
# Total variants
nrow(qc_df)## [1] 27485882
# Number of variants with DP < x
length(which(qc_df$DP < 10))## [1] 151
length(which(qc_df$DP < 20))## [1] 723
length(which(qc_df$DP < 30))## [1] 1823
length(which(qc_df$DP < 40))## [1] 3378
So few!
Try GQ >= 40 & DP >= 40 & MQ >= 50.
# Set location of data
in_dir = "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/filtered"
in_files_mikk_filt = list.files(in_dir, full.names = T)
mikk_list_filt = lapply(in_files_mikk_filt, function(IN_FILE) {
# Read in
readr::read_delim(IN_FILE,
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
})
names(mikk_list_filt) = basename(in_files_mikk_filt) %>%
str_remove(".windowed.pi")
# reorder
ordered_lines = tibble::tibble("LINE" = names(mikk_list_filt)) %>%
tidyr::separate(col = LINE, into = c("LINE", "SIB", "EXTRA"), sep = "_") %>%
dplyr::mutate(LINE = as.integer(LINE)) %>%
dplyr::arrange(LINE) %>%
tidyr::unite(col = "ORIGINAL", sep = "_", na.rm = T) %>%
dplyr::pull()## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 62 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16,
## 17, 18, 19, 20, 21, ...].
mikk_list_filt = mikk_list_filt[order(match(names(mikk_list_filt), ordered_lines))]
# Bind rows
mikk_df_filt = mikk_list_filt %>%
dplyr::bind_rows(.id = "LINE") %>%
# factor LINE to order
dplyr::mutate(LINE = factor(LINE, levels = names(mikk_list_filt)))# Set location of data
in_dir = "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/wild/filtered"
in_files_wild_filt = list.files(in_dir, full.names = T)
wild_list_filt = lapply(in_files_wild_filt, function(IN_FILE) {
# Read in
readr::read_delim(IN_FILE,
delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
dplyr::filter(CHROM != "MT") %>%
# Make CHR an integer
dplyr::mutate(CHROM = as.integer(CHROM)) %>%
# Create middle of bin
dplyr::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
})
names(wild_list_filt) = basename(in_files_wild_filt) %>%
str_remove(".windowed.pi")
# Bind rows
wild_df_filt = wild_list_filt %>%
dplyr::bind_rows(.id = "LINE")pi_df_filt = list("MIKK" = mikk_df_filt,
"WILD" = wild_df_filt) %>%
dplyr::bind_rows(.id = "POPULATION")purrr::map(1:24, function(CHR){
# Create plot
plot_df = mikk_df_filt %>%
# Filter for target CHROM
dplyr::filter(CHROM == CHR) %>%
# Create Mb column
dplyr::mutate(Mb = (BIN_START - 1) / 1e6)
out_plot = plot_df %>%
ggplot() +
geom_col(aes(Mb, PI, fill = PI)) +
facet_grid(rows = vars(LINE), cols = vars(CHROM)) +
theme_bw() +
theme(text = element_text(size = 5)) +
ylim(c(0, max(mikk_df$PI))) +
xlab("500 kb window start position (Mb)") +
ylab(expression(paste("Nucleotide diversity (", pi, ")", sep = ""))) +
ggtitle(paste("Chromosome ", CHR, sep = "")) +
scale_fill_viridis(name = expression(pi), limits = c(0,max(mikk_df$PI)))
# Adjust width dimensions of plot based on length of chromosome
w_dim = mikk_df_filt %>%
dplyr::filter(CHROM == CHR) %>%
dplyr::mutate(Mb = (BIN_START - 1) / 1e6) %>%
dplyr::pull(Mb) %>%
max(.)
w_dim = w_dim * 0.26
# Save
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr_filtered", paste(CHR, ".png", sep = "")),
plot = out_plot,
device = "png",
width = w_dim,
height = 20,
units = "in",
dpi = 400)
})knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr_filtered/1.png"))
knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr_filtered/22.png"))

# source `darker` function
source("https://gist.githubusercontent.com/brettellebi/c5015ee666cdf8d9f7e25fa3c8063c99/raw/91e601f82da6c614b4983d8afc4ef399fa58ed4b/karyoploteR_lighter_darker.R")
# What is the median for MIKK v WILD?
pi_df_filt %>%
dplyr::group_by(POPULATION) %>%
dplyr::summarise(MEAN_PI = mean(PI, na.rm = T),
MEDIAN_PI = median(PI, na.rm = T))## # A tibble: 2 × 3
## POPULATION MEAN_PI MEDIAN_PI
## <chr> <dbl> <dbl>
## 1 MIKK 0.000298 0.000082
## 2 WILD 0.000253 0.00008
# Together
mikk_wild_boxplot = pi_df_filt %>%
ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.1) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50 & GQ >= 40 & DP >= 40 )")
mikk_wild_boxplot
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_mikk_wild_boxplot.png"),
plot = mikk_wild_boxplot,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)set.seed(13)
# Pull out 7 random MIKK samples
mikk_random_samples = pi_df_filt %>%
dplyr::filter(POPULATION == "MIKK") %>%
dplyr::distinct(LINE) %>%
dplyr::slice_sample(n = 7) %>%
dplyr::pull(LINE)
# Plot
mikk_wild_boxplot_samp = pi_df_filt %>%
dplyr::filter(POPULATION == "WILD" | LINE %in% mikk_random_samples) %>%
ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.1) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50 & GQ >= 40 & DP >= 40 )\n7 MIKK lines & 7 wild Kiyosu samples")
mikk_wild_boxplot_samp
ggsave(here::here("docs/plots/nucleotide_diversity/20211011_mikk_wild_boxplot_sampled.png"),
plot = mikk_wild_boxplot_samp,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)mikk_wild_boxplot_facet = pi_df_filt %>%
ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.05) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
facet_wrap(vars(CHROM), nrow = 4, ncol = 6) +
guides(colour = "none", fill = "none") +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50 & GQ >= 40 & DP >= 40 )")
mikk_wild_boxplot_facet
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_mikk_wild_boxplot_facet.png"),
plot = mikk_wild_boxplot_facet,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)