Now that we have identified sex-linked markers based on patterns of heterozygosity and homozygosity, let’s assess how robust our results are by conducting permutation tests, similarly to how Jeffries et al. (2019) did.
I will now perform a permutation test, assigning sex at random in the same 50:50 proportion, and assess whether the number of identified sex linked markers is significantly different.
Because I am running 10.000 iterations, thus resulting in large lists (~50 Gb), I do so on the university High Performance Computing cluster. To achieve this, I wrote 2 scripts. An R scripts 3_3_PermutationTestOfSexLinkedLoci.R and a shell script for submitting the R job 3_3_PermutationTest.sh.
The former looks like this:
## Set working directory
setwd("~/jc274669/ProjectDirectory/3_SexLinked/")
## Load Required packages
library(tidyverse)
## Permutation test for number of identified sex linked loci
#read input data
my.df <- read_delim(file = "./13_2_InputDataframe_sex.txt",
delim = "\t")
#Now we want to create 10000 subsets of 2 random groups of 19 individuals each.
###############################################################
#create empty list of desired length
my.list <- vector("list",10000)
#iterate over 1:10000
for (i in seq(1,10000,1)){
#set seed
set.seed(i)
#sample 19 random numbers between 2 and 39
positions <- sample(2:39,19,replace = FALSE)
#subset dataframe according to random indices
LS_1 <- my.df %>% select(markerID, all_of(positions))
LS_2 <- my.df %>% select(-all_of(positions))
#join two subsets into one list
my.list.b <- list(LS_1,LS_2)
#name the list elements
names(my.list.b) <- c("LS_1","LS_2")
#assign list to empty outer list element
my.list[[i]] <- my.list.b
}
rm(LS_1,LS_2,my.list.b,positions)
###############################################################
#Now we calculate number of homozygotes, heterozygotes and missing data, as well as proportions of these, and proportion of the 2 alleles for all 100 random subsets
###############################################################
#create empty list of desired length
my.met <- vector("list",10000)
#iterate over numbers from 1 to 10000
for (i in seq(1,10000,1)){
#apply calculations to both list elements
#these represent the two 'sexes'
my.met.2 <- lapply(my.list[[i]], function(x){
markerID <- x %>% select(markerID)
tempx <- x %>% select(-markerID)
Zero <- rowSums(tempx==0)
One <- rowSums(tempx==1) # heterozygote
Two <- rowSums(tempx==2) # snp homozygote
Mis <- rowSums(tempx== -1) #missing data
y <- as_tibble(cbind(markerID,Zero,One,Two,Mis))
y <- y %>% mutate(
HomRef = Zero/(Zero+One+Two),
Het = One/(Zero+One+Two),
HomSnp = Two/(Zero+One+Two),
pMis = Mis/(Zero+One+Two+Mis),
Ref = ((2*Zero) + One)/(2*(Zero+One+Two)),
Snp = ((2*Two) + One)/(2*(Zero+One+Two))
)
return(y)
})
#join two 'sexes' by marker ID
my.met.2 <- full_join(my.met.2$LS_1, my.met.2$LS_2,
by="markerID", suffix = c("_1","_2"))
#assign resulting dataframe to outer list empty element
my.met[[i]] <- my.met.2
}
rm(my.met.2,i)
###############################################################
#Now for each of the methods that we want to assess (Method 1, Method 2, Method 5) we will calculate how many sex linked markers would be identified and build a distribution from it, and then compare whether that's significantly different from the number we identified.
###############################################################
### Method 1 ###
perm_M1 <- lapply(my.met, function(x){
y <- x %>%
filter(Ref_1 > 0.9 & Ref_2 > 0.4 & Ref_2 < 0.6) %>%
nrow()
return(y)
})
perm_M1 <- unlist(perm_M1)
### Method 2 ###
perm_M2 <- lapply(my.met, function(x){
y <- x %>%
filter(HomRef_2==1 & Het_1 >= 0.5) %>%
nrow()
return(y)
})
perm_M2 <- unlist(perm_M2)
### Method 3 ###
perm_M3 <- lapply(my.met, function(x){
y <- x %>%
filter(pMis_1==1 & pMis_2<=0.5) %>%
nrow()
return(y)
})
perm_M3 <- unlist(perm_M3)
### Method 4 ###
perm_M4 <- lapply(my.met, function(x){
y <- x %>%
filter(Het_1 > 0.75 &
HomSnp_2 < 0.1 &
Het_2 < 0.2 &
HomRef_2 > 0.8) %>%
nrow()
return(y)
})
perm_M4 <- unlist(perm_M4)
### Method 5 ###
perm_M5 <- lapply(my.met, function(x){
y <- x %>%
filter(Het_1 > 0.55 &
HomSnp_2 == 0 &
Het_2 < 0.2 &
HomRef_2 > 0.8) %>%
nrow()
return(y)
})
perm_M5 <- unlist(perm_M5)
#Save the resulting files
perm.results <- list(perm_M1, perm_M2, perm_M3, perm_M4, perm_M5)
saveRDS(perm.results, file = "./14_1_PermTest_Results.RData")
While the latter contains the following
#!/bin/bash-403
#PBS -N permTest
#PBS -l ncpus=1
#PBS -l mem=200g
#PBS -l walltime=24:00:00
#load R 4.0.3
module load anaconda3
source $CONDA_PROF/conda.sh
conda activate R-4.0.3
#set directory
cd ~/ProjectDirectory/3_SexLinked/
#run R script
R -f ./0_rscripts/3_3_PermutationTestOfSexLinkedLoci.R
#close conda environment
conda deactivate
Once the script is done running, I can read in the input file.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggpubr)
perm.results <- readRDS("../3_SexLinked/14_1_PermTest_Results.RData")
names(perm.results) <- c("M1","M2","M3","M4","M5")
This object contains the number of sex-linked loci identified across the 5.000 replicates for each of the four methods.
For the 3 methods that returned a clearer signal (i.e., Method 1, Method 2 and Method 4) we retrieved 2, 9 and 18 XY loci, and 0, 0 and 1 ZW loci respectively. Method 3 detected no sex-linked loci.
perm.results <- lapply(perm.results, function(x){
y <- tibble(rep=seq(1,10000,1), markers=x)
})
M1 <- perm.results$M1 %>% add_column(method="M1")
M2 <- perm.results$M2 %>% add_column(method="M2")
M3 <- perm.results$M3 %>% add_column(method="M3")
M4 <- perm.results$M4 %>% add_column(method="M4")
M5 <- perm.results$M5 %>% add_column(method="M5")
perm.results <- rbind(M1, M2, M3, M4, M5)
Now, let’s calculate the 95th and 99th percentile for these estimates, and see whether the number of markers we identified with the correct sex assignments is above that.
perm.results %>%
group_by(method) %>%
summarise(q95=quantile(markers,0.95),
q99=quantile(markers,0.99))
## # A tibble: 5 × 3
## method q95 q99
## <chr> <dbl> <dbl>
## 1 M1 9 21
## 2 M2 1 2
## 3 M3 0 0
## 4 M4 0 1
## 5 M5 5 8
sexlinked <- tibble(method=c("M1","M1","M2","M2",
"M3","M3","M4","M4","M5","M5"),
sexdet=c("XY","ZW","XY","ZW","XY","ZW","XY","ZW","XY","ZW"),
markers=c(4,1,10,0,0,0,0,0,25,1))
#signif <- tibble(method=c("M1"),
# markers=c(45))
Plot distribution of permutated estimates as violin plots, and plot estimate from true sex-assignment as coloured dots (blue for XY and red for ZW).
p1 <- ggplot() +
geom_violin(data=perm.results,
aes(y=markers,x=method)) +
geom_point(data=sexlinked,
aes(y=markers,x=method,
fill=sexdet,color=sexdet),
size = 3) +
geom_text(aes(x="M2",y=as.double(27)),label="**") +
geom_text(aes(x="M5",y=as.double(27)),label="**") +
scale_color_manual(values = c("XY" = "blue", "ZW" = "red")) +
ylab("Number of sex-linked markers detected") +
xlab("Method") +
scale_y_continuous(limits = c(0,28)) +
theme_classic() +
theme(legend.position = "none")
p1
## Warning: Removed 83 rows containing non-finite values (stat_ydensity).
Method 2 (Jeffries et al., 2018) and Method 4 (Lambert et al., 2016) found more XY linked markers than the 99th percentile of permuted estimates.
Save the resulting figure
ggsave(filename = "../3_SexLinked/14_2_PermTest_Plot_allMethods_R40.jpg",
plot=p1,
device = "jpeg",
width = 12, height = 4,
dpi = 300)
## Warning: Removed 83 rows containing non-finite values (stat_ydensity).
Now let’s plot without Method 3 and Method 4, which found no sex-linked markers, to obtain a clearer plot.
p2 <- ggplot() +
geom_violin(data=perm.results %>% filter(method != "M3" & method != "M4"),
aes(y=markers,x=method)) +
geom_point(data=sexlinked %>% filter(method != "M3" & method != "M4"),
aes(y=markers,x=method,
fill=sexdet,color=sexdet),
size = 3) +
geom_text(aes(x="M2",y=as.double(45)),label="**") +
geom_text(aes(x="M5",y=as.double(45)),label="**") +
scale_color_manual(values = c("XY" = "blue", "ZW" = "red")) +
ylab("Number of sex-linked markers detected") +
xlab("Method to detect sex-linked markers") +
scale_y_continuous(limits = c(0,50)) +
theme_classic() +
theme(legend.position = "none")
p2
## Warning: Removed 27 rows containing non-finite values (stat_ydensity).
Save the resulting figure
ggsave(filename = "../3_SexLinked/14_2_PermTest_Plot_R40.jpg",
plot=p2,
device = "jpeg",
width = 9, height = 4,
dpi = 300)
## Warning: Removed 27 rows containing non-finite values (stat_ydensity).
Let’s look at Method 1, 2 and 5 as histrograms
#95% and 99% quantiles
q95 <- 9
q99 <- 21
markers <- tibble(x=c(1,4),
y=c(7500,7500),
label=c("ZW","XY"))
pm1 <- ggplot(M1) +
geom_histogram(aes(x=markers), bins = 26) +
geom_vline(aes(xintercept=1), colour="red") +
geom_vline(aes(xintercept=4), colour="blue") +
geom_label(data=markers, aes(x=x,y=y,label=label)) +
#xlab("Number of sex-linked loci detected") +
theme_classic() +
theme(axis.title.y = element_blank(),
axis.title.x = element_blank()) +
scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))
pm1
## Warning: Removed 86 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
q95 <- 1
q99 <- 2
markers <- tibble(x=c(0,10),
y=c(7500,7500),
label=c("ZW","XY"))
pm2 <- ggplot(M2) +
geom_histogram(aes(x=markers), bins = 26) +
geom_vline(aes(xintercept=0), colour="red") +
geom_vline(aes(xintercept=10), colour="blue") +
geom_label(data=markers, aes(x=x,y=y,label=label)) +
#xlab("Number of sex-linked loci detected") +
theme_classic() +
theme(axis.title.y = element_blank(),
axis.title.x = element_blank()) +
scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))
pm2
## Warning: Removed 2 rows containing missing values (geom_bar).
q95 <- 5
q99 <- 8
markers <- tibble(x=c(1,25),
y=c(7500,7500),
label=c("ZW","XY"))
pm5 <- ggplot(M5) +
geom_histogram(aes(x=markers), bins = 26) +
geom_vline(aes(xintercept=1), colour="red") +
geom_vline(aes(xintercept=25), colour="blue") +
geom_label(data=markers, aes(x=x,y=y,label=label)) +
xlab("Number of sex-linked loci detected") +
theme_classic() +
theme(axis.title.y = element_blank()) +
scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))
pm5
## Warning: Removed 2 rows containing missing values (geom_bar).
Now arrange three graphs into one
p3 <- ggarrange(pm1,pm2,pm5,ncol=1,
labels = c("Method 1",
"Method 2", "Method 5"),
font.label = list(size=9), hjust = 5,
align = "hv")
## Warning: Removed 86 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Removed 2 rows containing missing values (geom_bar).
## Removed 2 rows containing missing values (geom_bar).
p3
ggsave(filename = "../3_SexLinked/14_3_PermTest_HistAll.jpg",
plot=p3,
device = "jpeg",
width = 5, height = 6,
dpi = 300)
Finally, we can obtain the expected number of false positives (i.e., the average number of detected sex-linked markers).
perm.results %>%
group_by(method) %>%
summarise(false.pos = mean(markers), sd = sd(markers))
## # A tibble: 5 × 3
## method false.pos sd
## <chr> <dbl> <dbl>
## 1 M1 2.07 5.11
## 2 M2 0.144 0.479
## 3 M3 0.0145 0.248
## 4 M4 0.0246 0.228
## 5 M5 1.51 1.82