library(here)
library(tidyverse)
library(karyoploteR)
library(plotly)
library(circlize)
library(GGally)
library(viridis)
https://github.com/brettellebi/mikk_genome/tree/master/code/snakemake/nucleotide_diversity
Steps
VCFtools
’s --window-pi
to calculate nucleotide divergence in different-sized windows.bcftools
to extract MQ
from INFO
field.Data location: /nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence
# Set location of data
= "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/all"
in_dir
= list.files(in_dir, full.names = T)
in_files = lapply(in_files, function(IN_FILE) {
dat_list # Read in
::read_delim(IN_FILE,
readrdelim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
})names(dat_list) = basename(in_files) %>%
str_remove(".windowed.pi")
# Set location of data
= "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/wild_kiyosu"
in_dir
= list.files(in_dir, full.names = T)
in_files = lapply(in_files, function(IN_FILE) {
wk_list # Read in
::read_delim(IN_FILE,
readrdelim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
})names(wk_list) = basename(in_files) %>%
str_remove(".windowed.pi")
= read.table(here::here("data",
med_chr_lens "Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
col.names = c("chr", "end"))
# Add start
$start = 1
med_chr_lens# Reorder
= med_chr_lens %>%
med_chr_lens ::select(chr, start, end) %>%
dplyr# remove MT
::filter(chr != "MT") %>%
dplyr# convert to numeric
::mutate(chr = as.integer(chr)) %>%
dplyr# order
::arrange(chr)
dplyr
# Create custom genome
= regioneR::toGRanges(med_chr_lens) med_genome
# Test with ggplot
$`1000000` %>%
dat_listggplot() +
geom_line(aes(BIN_MID, PI)) +
facet_grid(cols = vars(CHROM))
$`500000` %>%
dat_listggplot() +
geom_line(aes(BIN_MID, PI)) +
facet_grid(cols = vars(CHROM))
$`100000` %>%
dat_listggplot() +
geom_line(aes(BIN_MID, PI)) +
facet_grid(cols = vars(CHROM))
## Get max Y
<- function(x, roundTo, dir = 1) {
round.choose if(dir == 1) { ##ROUND UP
+ (roundTo - x %% roundTo)
x else {
} if(dir == 0) { ##ROUND DOWN
- (x %% roundTo)
x
}
}
}= round.choose(max(dat_list$`1000000`$PI), 0.01, 1) y_max
# set file name
= paste("20210930_", "1Mb", ".png", sep = "")
file_name = here::here("docs/plots/nucleotide_diversity", file_name)
file_out
png(file=file_out,
width=13000,
height=1000,
units = "px",
res = 300)
# Plot ideogram
= karyoploteR::plotKaryotype(med_genome, plot.type = 5)
kp
# Add base numbers
::kpAddBaseNumbers(kp, tick.dist = 5000000, cex = 0.7)
karyoploteR
# Set y-axis limits
::kpAxis(kp, ymin=0, ymax=y_max )
karyoploteR
# Add lines
= 1
lwd ::kpLines(kp,
karyoploteRchr = dat_list$`1000000`$CHROM,
x = dat_list$`1000000`$BIN_MID,
y = dat_list$`1000000`$PI,
ymax = y_max,
r0=0, r1 = 1,
lwd = lwd)
dev.off()
= here::here("docs/plots/nucleotide_diversity/20210930_1Mb.png")
file_out ::include_graphics(file_out) knitr
= "/nfs/research/birney/users/ian/mikk_genome/repeats/medaka_hdrr_repeats.fixed.gff"
repeats_file
= readr::read_delim(repeats_file,
hdrr_reps delim = "\t",
col_names = F,
skip = 3,
comment = "",
quote = "") %>%
# Remove empty V8 column
::select(-X8) %>%
dplyr# Get class of repeat from third column
::mutate(class = stringr::str_split(X3, pattern = "#", simplify = T)[, 1]) %>%
dplyr# Rename columns
::rename(chr = X1, tool = X2, class_full = X3, start = X4, end = X5, percent = X6, strand = X7, info = X9)
dplyr
# Find types of class other than "(GATCCA)n" types
= unique(hdrr_reps$class[grep(")n", hdrr_reps$class, invert = T)])
class_types
= hdrr_reps %>%
hdrr_reps # NA for blanks
::mutate(class = dplyr::na_if(class, "")) %>%
dplyr# "misc" for others in "(GATCCA)n" type classes
::mutate(class = dplyr::if_else(!class %in% class_types, "Miscellaneous", class)) %>%
dplyr# rename "Simple_repeat"
::mutate(class = dplyr::recode(class, "Simple_repeat" = "Simple repeat")) %>%
dplyr# filter out NA in `chr` (formerly MT)
::filter(!is.na(chr)) dplyr
# Create GRanges object with bins
= "1000000"
bin_length
= dat_list[[bin_length]] %>%
bin_intervals ::select(CHROM, BIN_START, BIN_END) %>%
dplyr# split into list by chromosome
split(., f = .$CHROM)
# Replace END pos of final bin for each chromsome with actual end pos
= lapply(1:length(bin_intervals), function(CHR){
bin_intervals # Get end position for target chromosome
= med_chr_lens %>%
end_pos ::filter(chr == CHR) %>%
dplyr::pull(end)
dplyr# Replace final bin end position
= bin_intervals[[CHR]]
out $BIN_END[nrow(out)] = end_pos
out
return(out)
%>%
}) ::bind_rows()
dplyr
# Convert to data frame
= as.data.frame(bin_intervals)
bin_intervals
# Convert to GRanges
= regioneR::toGRanges(bin_intervals) bin_ranges
# Convert `hdrr_reps` to GRanges
= GenomicRanges::makeGRangesFromDataFrame(hdrr_reps,
hdrr_ranges keep.extra.columns = T,
seqnames.field = "chr",
start.field = "start",
end.field = "end")
# Get non-overlapping regions
= GenomicRanges::disjoin(hdrr_ranges) hdrr_covered
= GenomicRanges::findOverlaps(bin_ranges, hdrr_covered)
overlaps
# Split into list by bin
= lapply(unique(overlaps@from), function(BIN){
overlaps_list = list()
out # Get indexes of all repeat ranges overlapping the target BIN
"hits"]] = overlaps[overlaps@from == BIN]@to
out[[# Extract those ranges from `hdrr_ranges`
"range_hits"]] = hdrr_covered[out[["hits"]]]
out[[# Get number bases covered by each range
"widths"]] = out[["range_hits"]] %>%
out[[::width(.)
GenomicRanges# Get summed widths
"summed"]] = out[["widths"]] %>%
out[[# Get total bases covered in bin
sum(.)
return(out)
})
= purrr::map(overlaps_list, function(BIN){
overlaps_vec "summed"]]
BIN[[%>%
}) unlist(.)
# Add as column to `bin_intervals`
$REPEAT_COV = overlaps_vec
bin_intervals
# Caclulate proportion
= bin_intervals %>%
bin_intervals ::mutate(REPEAT_PROP = REPEAT_COV / (BIN_END - BIN_START + 1)) dplyr
= "/nfs/research/birney/users/ian/mikk_genome/mapping_quality/mapping_quality.csv"
in_file
= readr::read_csv(in_file,
mq_df col_names = c("CHROM", "POS", "MQ"),
col_types = c("cid")) %>%
# remove 'MT'
::filter(!CHROM == "MT") %>%
dplyr# make CHROM integer
::mutate(CHROM = as.integer(CHROM))
dplyr
# Bin and get means
= mq_df %>%
mq_list split(., f = .$CHROM)
# Bin into 1 Mb intervals
= purrr::map(mq_list, function(CHR){
binned_mq_list # Set intervals
= seq(1, max(CHR$POS), by = 1000000)
intervals # add final length
if (max(intervals) != max(CHR$POS)) {
= c(intervals, max(CHR$POS))
intervals
}# bin
= CHR %>%
CHR ::mutate(BIN = cut(POS,
dplyrbreaks = intervals,
labels = F,
include.lowest = T))
})
# calculate mean MQ within each bin
= purrr::map(binned_mq_list, function(CHR){
mean_mq %>%
CHR # replace Inf with NA
::mutate(MQ = dplyr::na_if(MQ, Inf)) %>%
dplyr# Group by BIN
::group_by(BIN) %>%
dplyr# Calculate mean MQ within each bin
::summarise(MEAN_MQ = mean(MQ, na.rm = T))
dplyr%>%
}) # bind into single data frame
::bind_rows(.id = "CHR") dplyr
## Warning: One or more parsing issues, see `problems()` for details
# Bind data frames
= cbind(dat_list$`1000000`,
pi_repeat_df PI_WK = wk_list[["1000000"]]$PI,
REPEAT_PROP = bin_intervals$REPEAT_PROP,
MEAN_MQ = mean_mq$MEAN_MQ)
= GGally::ggpairs(pi_repeat_df, columns = c(5, 7, 8, 9),
pairs_plot lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5),
combo = wrap("dot_no_facet", alpha = 0.4)))
::ggplotly(pairs_plot) plotly
## Warning: Can only have one: highlight
## Warning: Can only have one: highlight
## Warning: Can only have one: highlight
= pi_repeat_df
df_circos_1Mb # take `BIN_END` from `bin_intervals` with chromosome end number for final bin
$BIN_END = bin_intervals$BIN_END df_circos_1Mb
= here::here("docs/plots/nucleotide_diversity", "20211001_circos_1Mb.png") out_plot
png(out_plot,
width = 20,
height = 20,
units = "cm",
res = 500)
# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
track.margin = c(0, 0),
gap.degree = c(rep(1, nrow(med_chr_lens) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(med_chr_lens,
plotType = c("axis", "labels"),
major.by = 1e7,
axis.labels.cex = 0.25*par("cex"))
# Add MIKK Pi line
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
2]],
value[[col = "#49A379",
area = T,
border = karyoploteR::darker("#49A379"))
},track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.021))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("MIKK (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '2'.
# Add wild Kiyosu line
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
4]],
value[[col = "#7A306C",
area = T,
border = karyoploteR::darker("#8A4F7D"))
},track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.021))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("Wild\nKiyosu (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '3'.
# Add repeat content
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
5]],
value[[type = "h",
col = "#0C1B33",
cex = 0.05,
baseline = 0)
},track.height = 0.1,
ylim = c(0,0.5),
bg.border = NA)
circos.yaxis(side = "right",
at = c(0, .5, 1),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 0.25,
labels = "Repeat\ncontent",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '4'.
# Add mean mapping quality
circos.genomicTrack(df_circos_1Mb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
6]],
value[[type = "h",
col = "#FF6978",
cex = 0.05,
baseline = 20)
},track.height = 0.1,
ylim = c(20,65),
bg.border = NA)
circos.yaxis(side = "right",
at = c(20, 50),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 40,
labels = "Mean MQ",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '5'.
# Add baseline
#circos.xaxis(h = "bottom",
# labels = F,
# major.tick = F)
# Print label in center
text(0, 0, "Nucleotide diversity in\nMIKK and wild Kiyosu medaka,\nrepeat content,\nand mapping quality\nin 1 Mb windows")
circos.clear()
dev.off()
## png
## 2
::include_graphics(out_plot) knitr
# Create GRanges object with bins
= "500000"
bin_length
= dat_list[[bin_length]] %>%
bin_intervals_500 ::select(CHROM, BIN_START, BIN_END) %>%
dplyr# split into list by chromosome
split(., f = .$CHROM)
# Replace END pos of final bin for each chromsome with actual end pos
= lapply(1:length(bin_intervals_500), function(CHR){
bin_intervals_500 # Get end position for target chromosome
= med_chr_lens %>%
end_pos ::filter(chr == CHR) %>%
dplyr::pull(end)
dplyr# Replace final bin end position
= bin_intervals_500[[CHR]]
out $BIN_END[nrow(out)] = end_pos
out
return(out)
%>%
}) ::bind_rows()
dplyr
# Convert to data frame
= as.data.frame(bin_intervals_500)
bin_intervals_500
# Convert to GRanges
= regioneR::toGRanges(bin_intervals_500)
bin_ranges
= GenomicRanges::findOverlaps(bin_ranges, hdrr_covered)
overlaps
# Split into list by bin
= lapply(unique(overlaps@from), function(BIN){
overlaps_list = list()
out # Get indexes of all repeat ranges overlapping the target BIN
"hits"]] = overlaps[overlaps@from == BIN]@to
out[[# Extract those ranges from `hdrr_ranges`
"range_hits"]] = hdrr_covered[out[["hits"]]]
out[[# Get number bases covered by each range
"widths"]] = out[["range_hits"]] %>%
out[[::width(.)
GenomicRanges# Get summed widths
"summed"]] = out[["widths"]] %>%
out[[# Get total bases covered in bin
sum(.)
return(out)
})
= purrr::map(overlaps_list, function(BIN){
overlaps_vec "summed"]]
BIN[[%>%
}) unlist(.)
# Add as column to `bin_intervals_500`
$REPEAT_COV = overlaps_vec
bin_intervals_500
# Caclulate proportion
= bin_intervals_500 %>%
bin_intervals_500 ::mutate(REPEAT_PROP = REPEAT_COV / (BIN_END - BIN_START + 1)) dplyr
# Bin into 1 Mb intervals
= purrr::map(mq_list, function(CHR){
binned_mq_list_500 # Set intervals
= seq(1, max(CHR$POS), by = 500000)
intervals # add final length
if (max(intervals) != max(CHR$POS)) {
= c(intervals, max(CHR$POS))
intervals
}# bin
= CHR %>%
CHR ::mutate(BIN = cut(POS,
dplyrbreaks = intervals,
labels = F,
include.lowest = T))
})
# calculate mean MQ within each bin
= purrr::map(binned_mq_list_500, function(CHR){
mean_mq_500 %>%
CHR # replace Inf with NA
::mutate(MQ = dplyr::na_if(MQ, Inf)) %>%
dplyr# Group by BIN
::group_by(BIN) %>%
dplyr# Calculate mean MQ within each bin
::summarise(MEAN_MQ = mean(MQ, na.rm = T))
dplyr%>%
}) # bind into single data frame
::bind_rows(.id = "CHR") dplyr
= cbind(dat_list$`500000`,
df_circos_500kb PI_WK = wk_list[["500000"]]$PI,
REPEAT_PROP = bin_intervals_500$REPEAT_PROP,
MEAN_MQ = mean_mq_500$MEAN_MQ)
# Add bin ends at the end of each chromosome to match real chromosome ends
$BIN_END = bin_intervals_500$BIN_END df_circos_500kb
= here::here("docs/plots/nucleotide_diversity/20211001_circos_500kb.png") out_plot
png(out_plot,
width = 20,
height = 20,
units = "cm",
res = 500)
# Set parameters
## Decrease cell padding from default c(0.02, 1.00, 0.02, 1.00)
circos.par(cell.padding = c(0, 0, 0, 0),
track.margin = c(0, 0),
gap.degree = c(rep(1, nrow(med_chr_lens) - 1), 6))
# Initialize plot
circos.initializeWithIdeogram(med_chr_lens,
plotType = c("axis", "labels"),
major.by = 1e7,
axis.labels.cex = 0.25*par("cex"))
# Add MIKK Pi line
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
2]],
value[[col = "#49A379",
area = T,
border = karyoploteR::darker("#49A379"))
},track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.023))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("MIKK (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '2'.
# Add wild Kiyosu line
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...){
circos.genomicLines(region,
4]],
value[[col = "#7A306C",
area = T,
border = karyoploteR::darker("#8A4F7D"))
},track.height = 0.1,
bg.border = NA,
ylim = c(0, 0.023))
circos.yaxis(side = "right",
at = c(.01, .02),
labels.cex = 0.25*par("cex"),
tick.length = 2
)
circos.text(0, 0.01,
labels = expression(paste("Wild\nKiyosu (", pi, ")", sep = "")),
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '3'.
# Add repeat content
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
5]],
value[[type = "h",
col = "#0C1B33",
lwd = 0.4,
cex = 0.05,
baseline = 0)
},track.height = 0.1,
ylim = c(0,0.65),
bg.border = NA)
circos.yaxis(side = "right",
at = c(0, .5, 1),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 0.325,
labels = "Repeat\ncontent",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '4'.
# Add mean mapping quality
circos.genomicTrack(df_circos_500kb,
panel.fun = function(region, value, ...) {
circos.genomicLines(region,
6]],
value[[type = "h",
col = "#FF6978",
lwd = 0.4,
cex = 0.05,
baseline = 20)
},track.height = 0.07,
ylim = c(20,65),
bg.border = NA)
circos.yaxis(side = "right",
at = c(20, 50),
labels.cex = 0.25*par("cex"),
tick.length = 5
)
circos.text(0, 40,
labels = "Mean MQ",
sector.index = "1",
facing = "clockwise",
adj = c(.5, 0),
cex = 0.4*par("cex"))
## Note: 1 point is out of plotting region in sector '1', track '5'.
# Print label in center
text(0, 0, "Nucleotide diversity in\nMIKK and wild Kiyosu medaka,\nwith repeat content\nand mapping quality\nin 500 kb windows")
circos.clear()
dev.off()
## png
## 2
::include_graphics(out_plot) knitr
# MIKK
mean(df_circos_500kb$PI)
## [1] 0.003829434
median(df_circos_500kb$PI)
## [1] 0.00329243
# Wild median Pi
mean(df_circos_500kb$PI_WK)
## [1] 0.003685076
median(df_circos_500kb$PI_WK)
## [1] 0.003102805
# Read in data
= readr::read_delim("/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/random/500.windowed.pi",
mikk_7 delim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
# Mean
mean(mikk_7$PI)
## [1] 0.003748531
median(mikk_7$PI)
## [1] 0.003197675
# Set location of data
= "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/per_sample"
in_dir
= list.files(in_dir, full.names = T)
in_files = lapply(in_files, function(IN_FILE) {
mikk_list # Read in
::read_delim(IN_FILE,
readrdelim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
})names(mikk_list) = basename(in_files) %>%
str_remove(".windowed.pi")
# reorder
= tibble::tibble("LINE" = names(mikk_list)) %>%
ordered_lines ::separate(col = LINE, into = c("LINE", "SIB", "EXTRA"), sep = "_") %>%
tidyr::mutate(LINE = as.integer(LINE)) %>%
dplyr::arrange(LINE) %>%
dplyr::unite(col = "ORIGINAL", sep = "_", na.rm = T) %>%
tidyr::pull() dplyr
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 62 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16,
## 17, 18, 19, 20, 21, ...].
= mikk_list[order(match(names(mikk_list), ordered_lines))]
mikk_list
# Bind rows
= mikk_list %>%
mikk_df ::bind_rows(.id = "LINE") %>%
dplyr# factor LINE to order
::mutate(LINE = factor(LINE, levels = names(mikk_list))) dplyr
# Set location of data
= "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/wild/per_sample"
in_dir
= list.files(in_dir, full.names = T)
in_files = lapply(in_files, function(IN_FILE) {
wild_list # Read in
::read_delim(IN_FILE,
readrdelim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
})names(wild_list) = basename(in_files) %>%
str_remove(".windowed.pi")
# Bind rows
= wild_list %>%
wild_df ::bind_rows(.id = "LINE") dplyr
= list("MIKK" = mikk_df,
pi_df "WILD" = wild_df) %>%
::bind_rows(.id = "POPULATION") dplyr
# Create palette
= c("#49A379", "#8A4F7D")
pal names(pal) = c("MIKK", "WILD")
= pi_df %>%
mikk_wild_boxplot_no_filter ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.1) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50)")
mikk_wild_boxplot_no_filter
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_mikk_wild_boxplot_no_filter.png"),
plot = mikk_wild_boxplot_no_filter,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)
::map(1:24, function(CHR){
purrr# Create plot
= mikk_df %>%
plot_df # Filter for target CHROM
::filter(CHROM == CHR) %>%
dplyr# Create Mb column
::mutate(Mb = (BIN_START - 1) / 1e6)
dplyr= plot_df %>%
out_plot ggplot() +
geom_col(aes(Mb, PI, fill = PI)) +
facet_grid(rows = vars(LINE), cols = vars(CHROM)) +
theme_bw() +
theme(text = element_text(size = 5)) +
ylim(c(0, max(mikk_df$PI))) +
xlab("500 kb window start position (Mb)") +
ylab(expression(paste("Nucleotide diversity (", pi, ")", sep = ""))) +
ggtitle(paste("Chromosome ", CHR, sep = "")) +
scale_fill_viridis(name = expression(pi), limits = c(0,max(mikk_df$PI)))
# Adjust width dimensions of plot based on length of chromosome
= mikk_df %>%
w_dim ::filter(CHROM == CHR) %>%
dplyr::mutate(Mb = (BIN_START - 1) / 1e6) %>%
dplyr::pull(Mb) %>%
dplyrmax(.)
= w_dim * 0.26
w_dim # Save
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr", paste(CHR, ".png", sep = "")),
plot = out_plot,
device = "png",
width = w_dim,
height = 20,
units = "in",
dpi = 400)
})
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/1.png")) knitr
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/2.png")) knitr
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/22.png")) knitr
= c(1, 22)
target_chrs # Create plot
= mikk_df %>%
final_plot_df # Filter for target CHROM
::filter(CHROM %in% target_chrs) %>%
dplyr# Create Mb column
::mutate(Mb = (BIN_START - 1) / 1e6)
dplyr= final_plot_df %>%
out_plot ggplot() +
geom_col(aes(Mb, PI, fill = PI)) +
facet_grid(rows = vars(LINE), cols = vars(CHROM)) +
theme_bw() +
theme(text = element_text(size = 5)) +
ylim(c(0, max(mikk_df$PI))) +
xlab("500 kb window start position (Mb)") +
ylab(expression(paste("Nucleotide diversity (", pi, ")", sep = ""))) +
scale_fill_viridis(name = expression(pi), limits = c(0,max(mikk_df$PI)))
out_plot
# Save
ggsave(here::here("docs/plots/nucleotide_diversity/20211013_chrs_1_22.png"),
plot = out_plot,
device = "png",
width = 15,
height = 20,
units = "in",
dpi = 400)
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/1.png")) knitr
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/2.png")) knitr
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr/22.png")) knitr
= "/hps/nobackup/birney/users/ian/mikk_genome/qc_stats/10_1.csv"
qc_file
= readr::read_csv(qc_file,
qc_df col_names = c("CHROM", "POS", "DP", "MQ", "QD", "GQ"),
col_types = c("ciiddi")) %>%
# Filter for non-missing GQ
::filter(complete.cases(GQ)) %>%
dplyr# Filter out MT
::filter(CHROM != "MT") %>%
dplyr# Re-class CHROM
::mutate(CHROM = factor(CHROM, levels = 1:24)) dplyr
## Warning: One or more parsing issues, see `problems()` for details
= qc_df %>%
qc_pairs # take sample to prevent over-plotting
::slice_sample(n = 1e5) %>%
dplyr# create log10 for `DP`
::mutate(LOG_10_DP = log10(DP)) %>%
dplyr::ggpairs(.,
GGallycolumns = c(7, 4, 5, 6),
lower = list(continuous = wrap("points", alpha = 0.2, size = 0.5),
combo = wrap("dot_no_facet", alpha = 0.4)))
qc_pairs
## plot: [1,1] [=====>----------------------------------------------------------------------------------------] 6% est: 0s
## plot: [1,2] [===========>----------------------------------------------------------------------------------] 12% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removing 1 row that contained a missing value
## plot: [1,3] [=================>----------------------------------------------------------------------------] 19% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removed 2 rows containing missing values
## plot: [1,4] [=======================>----------------------------------------------------------------------] 25% est: 3s
## plot: [2,1] [============================>-----------------------------------------------------------------] 31% est: 2s
## Warning: Removed 1 rows containing missing values (geom_point).
## plot: [2,2] [==================================>-----------------------------------------------------------] 38% est: 2s
## Warning: Removed 1 rows containing non-finite values (stat_density).
## plot: [2,3] [========================================>-----------------------------------------------------] 44% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removed 2 rows containing missing values
## plot: [2,4] [==============================================>-----------------------------------------------] 50% est: 2s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removing 1 row that contained a missing value
## plot: [3,1] [====================================================>-----------------------------------------] 56% est: 1s
## Warning: Removed 2 rows containing missing values (geom_point).
## plot: [3,2] [==========================================================>-----------------------------------] 62% est: 1s
## Warning: Removed 2 rows containing missing values (geom_point).
## plot: [3,3] [================================================================>-----------------------------] 69% est: 1s
## Warning: Removed 2 rows containing non-finite values (stat_density).
## plot: [3,4] [=====================================================================>------------------------] 75% est: 1s
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, : Removed 2 rows containing missing values
## plot: [4,1] [===========================================================================>------------------] 81% est: 1s
## plot: [4,2] [=================================================================================>------------] 88% est: 0s
## Warning: Removed 1 rows containing missing values (geom_point).
## plot: [4,3] [=======================================================================================>------] 94% est: 0s
## Warning: Removed 2 rows containing missing values (geom_point).
## plot: [4,4] [==============================================================================================]100% est: 0s
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_qc_pairs.png"),
plot = qc_pairs,
device = "png",
width = 10,
height = 6,
units = "in",
dpi = 400)
How many calls have DP < x?
# Total variants
nrow(qc_df)
## [1] 27485882
# Number of variants with DP < x
length(which(qc_df$DP < 10))
## [1] 151
length(which(qc_df$DP < 20))
## [1] 723
length(which(qc_df$DP < 30))
## [1] 1823
length(which(qc_df$DP < 40))
## [1] 3378
So few!
Try GQ >= 40 & DP >= 40 & MQ >= 50.
# Set location of data
= "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/mikk/filtered"
in_dir
= list.files(in_dir, full.names = T)
in_files_mikk_filt = lapply(in_files_mikk_filt, function(IN_FILE) {
mikk_list_filt # Read in
::read_delim(IN_FILE,
readrdelim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
})names(mikk_list_filt) = basename(in_files_mikk_filt) %>%
str_remove(".windowed.pi")
# reorder
= tibble::tibble("LINE" = names(mikk_list_filt)) %>%
ordered_lines ::separate(col = LINE, into = c("LINE", "SIB", "EXTRA"), sep = "_") %>%
tidyr::mutate(LINE = as.integer(LINE)) %>%
dplyr::arrange(LINE) %>%
dplyr::unite(col = "ORIGINAL", sep = "_", na.rm = T) %>%
tidyr::pull() dplyr
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 62 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16,
## 17, 18, 19, 20, 21, ...].
= mikk_list_filt[order(match(names(mikk_list_filt), ordered_lines))]
mikk_list_filt
# Bind rows
= mikk_list_filt %>%
mikk_df_filt ::bind_rows(.id = "LINE") %>%
dplyr# factor LINE to order
::mutate(LINE = factor(LINE, levels = names(mikk_list_filt))) dplyr
# Set location of data
= "/nfs/research/birney/users/ian/mikk_genome/nucleotide_divergence/wild/filtered"
in_dir
= list.files(in_dir, full.names = T)
in_files_wild_filt = lapply(in_files_wild_filt, function(IN_FILE) {
wild_list_filt # Read in
::read_delim(IN_FILE,
readrdelim = "\t",
col_types = c("ciiid")) %>%
# Remove MT
::filter(CHROM != "MT") %>%
dplyr# Make CHR an integer
::mutate(CHROM = as.integer(CHROM)) %>%
dplyr# Create middle of bin
::mutate(BIN_MID = ((BIN_END - BIN_START - 1) / 2) + BIN_START )
dplyr
})names(wild_list_filt) = basename(in_files_wild_filt) %>%
str_remove(".windowed.pi")
# Bind rows
= wild_list_filt %>%
wild_df_filt ::bind_rows(.id = "LINE") dplyr
= list("MIKK" = mikk_df_filt,
pi_df_filt "WILD" = wild_df_filt) %>%
::bind_rows(.id = "POPULATION") dplyr
::map(1:24, function(CHR){
purrr# Create plot
= mikk_df_filt %>%
plot_df # Filter for target CHROM
::filter(CHROM == CHR) %>%
dplyr# Create Mb column
::mutate(Mb = (BIN_START - 1) / 1e6)
dplyr
= plot_df %>%
out_plot ggplot() +
geom_col(aes(Mb, PI, fill = PI)) +
facet_grid(rows = vars(LINE), cols = vars(CHROM)) +
theme_bw() +
theme(text = element_text(size = 5)) +
ylim(c(0, max(mikk_df$PI))) +
xlab("500 kb window start position (Mb)") +
ylab(expression(paste("Nucleotide diversity (", pi, ")", sep = ""))) +
ggtitle(paste("Chromosome ", CHR, sep = "")) +
scale_fill_viridis(name = expression(pi), limits = c(0,max(mikk_df$PI)))
# Adjust width dimensions of plot based on length of chromosome
= mikk_df_filt %>%
w_dim ::filter(CHROM == CHR) %>%
dplyr::mutate(Mb = (BIN_START - 1) / 1e6) %>%
dplyr::pull(Mb) %>%
dplyrmax(.)
= w_dim * 0.26
w_dim # Save
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr_filtered", paste(CHR, ".png", sep = "")),
plot = out_plot,
device = "png",
width = w_dim,
height = 20,
units = "in",
dpi = 400)
})
::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr_filtered/1.png"))
knitr::include_graphics(here::here("docs/plots/nucleotide_diversity/20211006_pi_per_chr_filtered/22.png")) knitr
# source `darker` function
source("https://gist.githubusercontent.com/brettellebi/c5015ee666cdf8d9f7e25fa3c8063c99/raw/91e601f82da6c614b4983d8afc4ef399fa58ed4b/karyoploteR_lighter_darker.R")
# What is the median for MIKK v WILD?
%>%
pi_df_filt ::group_by(POPULATION) %>%
dplyr::summarise(MEAN_PI = mean(PI, na.rm = T),
dplyrMEDIAN_PI = median(PI, na.rm = T))
## # A tibble: 2 × 3
## POPULATION MEAN_PI MEDIAN_PI
## <chr> <dbl> <dbl>
## 1 MIKK 0.000298 0.000082
## 2 WILD 0.000253 0.00008
# Together
= pi_df_filt %>%
mikk_wild_boxplot ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.1) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50 & GQ >= 40 & DP >= 40 )")
mikk_wild_boxplot
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_mikk_wild_boxplot.png"),
plot = mikk_wild_boxplot,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)
set.seed(13)
# Pull out 7 random MIKK samples
= pi_df_filt %>%
mikk_random_samples ::filter(POPULATION == "MIKK") %>%
dplyr::distinct(LINE) %>%
dplyr::slice_sample(n = 7) %>%
dplyr::pull(LINE)
dplyr
# Plot
= pi_df_filt %>%
mikk_wild_boxplot_samp ::filter(POPULATION == "WILD" | LINE %in% mikk_random_samples) %>%
dplyrggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.1) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50 & GQ >= 40 & DP >= 40 )\n7 MIKK lines & 7 wild Kiyosu samples")
mikk_wild_boxplot_samp
ggsave(here::here("docs/plots/nucleotide_diversity/20211011_mikk_wild_boxplot_sampled.png"),
plot = mikk_wild_boxplot_samp,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)
= pi_df_filt %>%
mikk_wild_boxplot_facet ggplot() +
geom_violin(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION)) +
geom_boxplot(aes(POPULATION, log10(PI), colour = POPULATION, fill = POPULATION), width = 0.05) +
theme_bw() +
scale_fill_manual(name = "Population", values = pal) +
scale_colour_manual(name = "Population", values = darker(pal, 80)) +
ylab(expression(paste(log[10], "(", pi, ")", sep = ""))) +
facet_wrap(vars(CHROM), nrow = 4, ncol = 6) +
guides(colour = "none", fill = "none") +
ggtitle("Nucleotide diversity in 500 kb bins (MAPQ >= 50 & GQ >= 40 & DP >= 40 )")
mikk_wild_boxplot_facet
ggsave(here::here("docs/plots/nucleotide_diversity/20211006_mikk_wild_boxplot_facet.png"),
plot = mikk_wild_boxplot_facet,
device = "png",
width = 8,
height = 6,
units = "in",
dpi = 400)