1. Background

Now that we have identified sex-linked markers based on patterns of heterozygosity and homozygosity, let’s assess how robust our results are by conducting permutation tests, similarly to how Jeffries et al. (2019) did.

2. Permutation test for number of identified sex linked loci

I will now perform a permutation test, assigning sex at random in the same 50:50 proportion, and assess whether the number of identified sex linked markers is significantly different.

2.1. Running the permutations on the University HPC

Because I am running 10.000 iterations, thus resulting in large lists (~50 Gb), I do so on the university High Performance Computing cluster. To achieve this, I wrote 2 scripts. An R scripts 3_3_PermutationTestOfSexLinkedLoci.R and a shell script for submitting the R job 3_3_PermutationTest.sh.

The former looks like this:

## Set working directory
setwd("~/jc274669/ProjectDirectory/3_SexLinked/")

## Load Required packages
library(tidyverse)

## Permutation test for number of identified sex linked loci

#read input data
my.df <- read_delim(file = "./13_2_InputDataframe_sex.txt",
                    delim = "\t")

#Now we want to create 10000 subsets of 2 random groups of 19 individuals each.

###############################################################
#create empty list of desired length
my.list <- vector("list",10000)

#iterate over 1:10000
for (i in seq(1,10000,1)){
  #set seed
  set.seed(i)
  
  #sample 19 random numbers between 2 and 39
  positions <- sample(2:39,19,replace = FALSE)
  
  #subset dataframe according to random indices
  LS_1 <- my.df %>% select(markerID, all_of(positions))
  LS_2 <- my.df %>% select(-all_of(positions))
  
  #join two subsets into one list
  my.list.b <- list(LS_1,LS_2)
  
  #name the list elements
  names(my.list.b) <- c("LS_1","LS_2")
  
  #assign list to empty outer list element
  my.list[[i]] <- my.list.b
}

rm(LS_1,LS_2,my.list.b,positions)
###############################################################

#Now we calculate number of homozygotes, heterozygotes and missing data, as well as proportions of these, and proportion of the 2 alleles for all 100 random subsets

###############################################################
#create empty list of desired length
my.met <- vector("list",10000)

#iterate over numbers from 1 to 10000
for (i in seq(1,10000,1)){
  
  #apply calculations to both list elements
  #these represent the two 'sexes'
  my.met.2 <- lapply(my.list[[i]], function(x){
    
    markerID <- x %>% select(markerID)
    tempx <- x %>% select(-markerID)
    
    Zero <- rowSums(tempx==0)
    One <- rowSums(tempx==1) # heterozygote
    Two <- rowSums(tempx==2) # snp homozygote
    Mis <- rowSums(tempx== -1) #missing data
    
    y <- as_tibble(cbind(markerID,Zero,One,Two,Mis))
    
    y <- y %>% mutate(
    HomRef = Zero/(Zero+One+Two),
    Het = One/(Zero+One+Two),
    HomSnp = Two/(Zero+One+Two),
    pMis = Mis/(Zero+One+Two+Mis),
    Ref = ((2*Zero) + One)/(2*(Zero+One+Two)),
    Snp = ((2*Two) + One)/(2*(Zero+One+Two))
    )
    
    return(y)  
  })
  
  #join two 'sexes' by marker ID
  my.met.2 <- full_join(my.met.2$LS_1, my.met.2$LS_2,
                by="markerID", suffix = c("_1","_2"))
  
  #assign resulting dataframe to outer list empty element
  my.met[[i]] <- my.met.2
}

rm(my.met.2,i)
###############################################################

#Now for each of the methods that we want to assess (Method 1, Method 2, Method 5) we will calculate how many sex linked markers would be identified and build a distribution from it, and then compare whether that's significantly different from the number we identified.


###############################################################
### Method 1 ###

perm_M1 <- lapply(my.met, function(x){
  y <- x %>%
    filter(Ref_1 > 0.9 & Ref_2 > 0.4 & Ref_2 < 0.6) %>%
    nrow()
  
  return(y)
})

perm_M1 <- unlist(perm_M1)

### Method 2 ###

perm_M2 <- lapply(my.met, function(x){
  y <- x %>%
    filter(HomRef_2==1 & Het_1 >= 0.5) %>%
    nrow()
  
  return(y)
})

perm_M2 <- unlist(perm_M2)

### Method 3 ###

perm_M3 <- lapply(my.met, function(x){
  y <- x %>%
    filter(pMis_1==1 & pMis_2<=0.5) %>%
    nrow()
  
  return(y)
})

perm_M3 <- unlist(perm_M3)

### Method 4 ###

perm_M4 <- lapply(my.met, function(x){
  y <- x %>%
    filter(Het_1 > 0.75 &
             HomSnp_2 < 0.1 &
             Het_2 < 0.2 &
             HomRef_2 > 0.8) %>%
    nrow()
  
  return(y)
})

perm_M4 <- unlist(perm_M4)

### Method 5 ###

perm_M5 <- lapply(my.met, function(x){
  y <- x %>%
    filter(Het_1 > 0.55 &
             HomSnp_2 == 0 &
             Het_2 < 0.2 &
             HomRef_2 > 0.8) %>%
    nrow()
  
  return(y)
})

perm_M5 <- unlist(perm_M5)

#Save the resulting files

perm.results <- list(perm_M1, perm_M2, perm_M3, perm_M4, perm_M5)
saveRDS(perm.results, file = "./14_1_PermTest_Results.RData")

While the latter contains the following

#!/bin/bash-403

#PBS -N permTest
#PBS -l ncpus=1
#PBS -l mem=200g
#PBS -l walltime=24:00:00

#load R 4.0.3
module load anaconda3
source $CONDA_PROF/conda.sh
conda activate R-4.0.3

#set directory
cd ~/ProjectDirectory/3_SexLinked/

#run R script
R -f ./0_rscripts/3_3_PermutationTestOfSexLinkedLoci.R

#close conda environment
conda deactivate

2.2. Plotting the results

Once the script is done running, I can read in the input file.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggpubr)
perm.results <- readRDS("../3_SexLinked/14_1_PermTest_Results.RData")

names(perm.results) <- c("M1","M2","M3","M4","M5")

This object contains the number of sex-linked loci identified across the 5.000 replicates for each of the four methods.

For the 3 methods that returned a clearer signal (i.e., Method 1, Method 2 and Method 4) we retrieved 2, 9 and 18 XY loci, and 0, 0 and 1 ZW loci respectively. Method 3 detected no sex-linked loci.

perm.results <- lapply(perm.results, function(x){
  y <- tibble(rep=seq(1,10000,1), markers=x)
})

M1 <- perm.results$M1 %>% add_column(method="M1")
M2 <- perm.results$M2 %>% add_column(method="M2")
M3 <- perm.results$M3 %>% add_column(method="M3")
M4 <- perm.results$M4 %>% add_column(method="M4")
M5 <- perm.results$M5 %>% add_column(method="M5")

perm.results <- rbind(M1, M2, M3, M4, M5)

Now, let’s calculate the 95th and 99th percentile for these estimates, and see whether the number of markers we identified with the correct sex assignments is above that.

perm.results %>%
  group_by(method) %>%
  summarise(q95=quantile(markers,0.95),
            q99=quantile(markers,0.99))
## # A tibble: 5 × 3
##   method   q95   q99
##   <chr>  <dbl> <dbl>
## 1 M1         9    21
## 2 M2         1     2
## 3 M3         0     0
## 4 M4         0     1
## 5 M5         5     8
sexlinked <- tibble(method=c("M1","M1","M2","M2",
                             "M3","M3","M4","M4","M5","M5"),
                    sexdet=c("XY","ZW","XY","ZW","XY","ZW","XY","ZW","XY","ZW"),
                    markers=c(4,1,10,0,0,0,0,0,25,1))

#signif <- tibble(method=c("M1"),
#                 markers=c(45))

Plot distribution of permutated estimates as violin plots, and plot estimate from true sex-assignment as coloured dots (blue for XY and red for ZW).

p1 <- ggplot() +
  geom_violin(data=perm.results,
              aes(y=markers,x=method)) +
  geom_point(data=sexlinked,
             aes(y=markers,x=method,
                 fill=sexdet,color=sexdet),
             size = 3) +
  geom_text(aes(x="M2",y=as.double(27)),label="**") +
  geom_text(aes(x="M5",y=as.double(27)),label="**") +
  scale_color_manual(values = c("XY" = "blue", "ZW" = "red")) +
  ylab("Number of sex-linked markers detected") +
  xlab("Method") +
  scale_y_continuous(limits = c(0,28)) +
  theme_classic() +
  theme(legend.position = "none")

p1
## Warning: Removed 83 rows containing non-finite values (stat_ydensity).

Method 2 (Jeffries et al., 2018) and Method 4 (Lambert et al., 2016) found more XY linked markers than the 99th percentile of permuted estimates.

Save the resulting figure

ggsave(filename = "../3_SexLinked/14_2_PermTest_Plot_allMethods_R40.jpg",
       plot=p1,
       device = "jpeg",
       width = 12, height = 4,
       dpi = 300)
## Warning: Removed 83 rows containing non-finite values (stat_ydensity).

Now let’s plot without Method 3 and Method 4, which found no sex-linked markers, to obtain a clearer plot.

p2 <- ggplot() +
  geom_violin(data=perm.results %>% filter(method != "M3" & method != "M4"),
              aes(y=markers,x=method)) +
  geom_point(data=sexlinked %>% filter(method != "M3" & method != "M4"),
             aes(y=markers,x=method,
                 fill=sexdet,color=sexdet),
             size = 3) +
  geom_text(aes(x="M2",y=as.double(45)),label="**") +
  geom_text(aes(x="M5",y=as.double(45)),label="**") +
  scale_color_manual(values = c("XY" = "blue", "ZW" = "red")) +
  ylab("Number of sex-linked markers detected") +
  xlab("Method to detect sex-linked markers") +
  scale_y_continuous(limits = c(0,50)) +
  theme_classic() +
  theme(legend.position = "none")

p2
## Warning: Removed 27 rows containing non-finite values (stat_ydensity).

Save the resulting figure

ggsave(filename = "../3_SexLinked/14_2_PermTest_Plot_R40.jpg",
       plot=p2,
       device = "jpeg",
       width = 9, height = 4,
       dpi = 300)
## Warning: Removed 27 rows containing non-finite values (stat_ydensity).

Let’s look at Method 1, 2 and 5 as histrograms

#95% and 99% quantiles
q95 <- 9
q99 <- 21

markers <- tibble(x=c(1,4),
                  y=c(7500,7500),
                  label=c("ZW","XY"))

pm1 <- ggplot(M1) +
  geom_histogram(aes(x=markers), bins = 26) +
  geom_vline(aes(xintercept=1), colour="red") +
  geom_vline(aes(xintercept=4), colour="blue") +
  geom_label(data=markers, aes(x=x,y=y,label=label)) +
  #xlab("Number of sex-linked loci detected") +
  theme_classic() +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank()) +
  scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
  scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))

pm1
## Warning: Removed 86 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

q95 <- 1
q99 <- 2
markers <- tibble(x=c(0,10),
                  y=c(7500,7500),
                  label=c("ZW","XY"))

pm2 <- ggplot(M2) +
  geom_histogram(aes(x=markers), bins = 26) +
  geom_vline(aes(xintercept=0), colour="red") +
  geom_vline(aes(xintercept=10), colour="blue") +
  geom_label(data=markers, aes(x=x,y=y,label=label)) +
  #xlab("Number of sex-linked loci detected") +
  theme_classic() +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank()) +
  scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
  scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))

pm2
## Warning: Removed 2 rows containing missing values (geom_bar).

q95 <- 5
q99 <- 8
markers <- tibble(x=c(1,25),
                  y=c(7500,7500),
                  label=c("ZW","XY"))

pm5 <- ggplot(M5) +
  geom_histogram(aes(x=markers), bins = 26) +
  geom_vline(aes(xintercept=1), colour="red") +
  geom_vline(aes(xintercept=25), colour="blue") +
  geom_label(data=markers, aes(x=x,y=y,label=label)) +
  xlab("Number of sex-linked loci detected") +
  theme_classic() +
  theme(axis.title.y = element_blank()) +
  scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
  scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))

pm5
## Warning: Removed 2 rows containing missing values (geom_bar).

Now arrange three graphs into one

p3 <- ggarrange(pm1,pm2,pm5,ncol=1,
                labels = c("Method 1",
                           "Method 2", "Method 5"),
                font.label = list(size=9), hjust = 5,
                align = "hv")
## Warning: Removed 86 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Removed 2 rows containing missing values (geom_bar).
## Removed 2 rows containing missing values (geom_bar).
p3

ggsave(filename = "../3_SexLinked/14_3_PermTest_HistAll.jpg",
       plot=p3,
       device = "jpeg",
       width = 5, height = 6,
       dpi = 300)

Finally, we can obtain the expected number of false positives (i.e., the average number of detected sex-linked markers).

perm.results %>%
  group_by(method) %>%
  summarise(false.pos = mean(markers), sd = sd(markers))
## # A tibble: 5 × 3
##   method false.pos    sd
##   <chr>      <dbl> <dbl>
## 1 M1        2.07   5.11 
## 2 M2        0.144  0.479
## 3 M3        0.0145 0.248
## 4 M4        0.0246 0.228
## 5 M5        1.51   1.82