14. Permutation test

2. Permutation test for number of identified sex linked loci

I will now perform a permutation test, assigning sex at random in the same 50:50 proportion, and assess whether the number of identified sex linked markers is significantly different.

2.1. Running the permutations on the University HPC

Because I am running 10.000 iterations, thus resulting in large lists (~50 Gb), I do so on the university High Performance Computing cluster. To achieve this, I wrote 2 scripts. An R scripts 3_3_PermutationTestOfSexLinkedLoci.R and a shell script for submitting the R job 3_3_PermutationTest.sh.

The former looks like this:

## Set working directory
setwd("~/jc274669/ProjectDirectory/3_SexLinked/")

## Load Required packages
library(tidyverse)

## Permutation test for number of identified sex linked loci

#read input data
my.df <- read_delim(file = "./13_2_InputDataframe_sex.txt",
                    delim = "\t")

#Now we want to create 10000 subsets of 2 random groups of 19 individuals each.

###############################################################
#create empty list of desired length
my.list <- vector("list",10000)

#iterate over 1:10000
for (i in seq(1,10000,1)){
  #set seed
  set.seed(i)
  
  #sample 19 random numbers between 2 and 39
  positions <- sample(2:39,19,replace = FALSE)
  
  #subset dataframe according to random indices
  LS_1 <- my.df %>% select(markerID, all_of(positions))
  LS_2 <- my.df %>% select(-all_of(positions))
  
  #join two subsets into one list
  my.list.b <- list(LS_1,LS_2)
  
  #name the list elements
  names(my.list.b) <- c("LS_1","LS_2")
  
  #assign list to empty outer list element
  my.list[[i]] <- my.list.b
}

rm(LS_1,LS_2,my.list.b,positions)
###############################################################

#Now we calculate number of homozygotes, heterozygotes and missing data, as well as proportions of these, and proportion of the 2 alleles for all 100 random subsets

###############################################################
#create empty list of desired length
my.met <- vector("list",10000)

#iterate over numbers from 1 to 10000
for (i in seq(1,10000,1)){
  
  #apply calculations to both list elements
  #these represent the two 'sexes'
  my.met.2 <- lapply(my.list[[i]], function(x){
    
    markerID <- x %>% select(markerID)
    tempx <- x %>% select(-markerID)
    
    Zero <- rowSums(tempx==0)
    One <- rowSums(tempx==1) # heterozygote
    Two <- rowSums(tempx==2) # snp homozygote
    Mis <- rowSums(tempx== -1) #missing data
    
    y <- as_tibble(cbind(markerID,Zero,One,Two,Mis))
    
    y <- y %>% mutate(
    HomRef = Zero/(Zero+One+Two),
    Het = One/(Zero+One+Two),
    HomSnp = Two/(Zero+One+Two),
    pMis = Mis/(Zero+One+Two+Mis),
    Ref = ((2*Zero) + One)/(2*(Zero+One+Two)),
    Snp = ((2*Two) + One)/(2*(Zero+One+Two))
    )
    
    return(y)  
  })
  
  #join two 'sexes' by marker ID
  my.met.2 <- full_join(my.met.2$LS_1, my.met.2$LS_2,
                by="markerID", suffix = c("_1","_2"))
  
  #assign resulting dataframe to outer list empty element
  my.met[[i]] <- my.met.2
}

rm(my.met.2,i)
###############################################################

#Now for each of the methods that we want to assess (Method 1, Method 2, Method 5) we will calculate how many sex linked markers would be identified and build a distribution from it, and then compare whether that's significantly different from the number we identified.


###############################################################
### Method 1 ###

perm_M1 <- lapply(my.met, function(x){
  y <- x %>%
    filter(Ref_1 > 0.9 & Ref_2 > 0.4 & Ref_2 < 0.6) %>%
    nrow()
  
  return(y)
})

perm_M1 <- unlist(perm_M1)

### Method 2 ###

perm_M2 <- lapply(my.met, function(x){
  y <- x %>%
    filter(HomRef_2==1 & Het_1 >= 0.5) %>%
    nrow()
  
  return(y)
})

perm_M2 <- unlist(perm_M2)

### Method 3 ###

perm_M3 <- lapply(my.met, function(x){
  y <- x %>%
    filter(pMis_1==1 & pMis_2<=0.5) %>%
    nrow()
  
  return(y)
})

perm_M3 <- unlist(perm_M3)

### Method 4 ###

perm_M4 <- lapply(my.met, function(x){
  y <- x %>%
    filter(Het_1 > 0.75 &
             HomSnp_2 < 0.1 &
             Het_2 < 0.2 &
             HomRef_2 > 0.8) %>%
    nrow()
  
  return(y)
})

perm_M4 <- unlist(perm_M4)

### Method 5 ###

perm_M5 <- lapply(my.met, function(x){
  y <- x %>%
    filter(Het_1 > 0.55 &
             HomSnp_2 == 0 &
             Het_2 < 0.2 &
             HomRef_2 > 0.8) %>%
    nrow()
  
  return(y)
})

perm_M5 <- unlist(perm_M5)

#Save the resulting files

perm.results <- list(perm_M1, perm_M2, perm_M3, perm_M4, perm_M5)
saveRDS(perm.results, file = "./14_1_PermTest_Results.RData")

While the latter contains the following

#!/bin/bash-403

#PBS -N permTest
#PBS -l ncpus=1
#PBS -l mem=200g
#PBS -l walltime=24:00:00

#load R 4.0.3
module load anaconda3
source $CONDA_PROF/conda.sh
conda activate R-4.0.3

#set directory
cd ~/ProjectDirectory/3_SexLinked/

#run R script
R -f ./0_rscripts/3_3_PermutationTestOfSexLinkedLoci.R

#close conda environment
conda deactivate

2.2. Plotting the results

Once the script is done running, I can read in the input file.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## Warning: package 'tidyr' was built under R version 4.0.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggpubr)

perm.results <- readRDS("../3_SexLinked/14_1_PermTest_Results.RData")

names(perm.results) <- c("M1","M2","M3","M4","M5")

This object contains the number of sex-linked loci identified across the 5.000 replicates for each of the four methods.

For the 3 methods that returned a clearer signal (i.e., Method 1, Method 2 and Method 4) we retrieved 2, 9 and 18 XY loci, and 0, 0 and 1 ZW loci respectively. Method 3 detected no sex-linked loci.

perm.results <- lapply(perm.results, function(x){
  y <- tibble(rep=seq(1,10000,1), markers=x)
})

M1 <- perm.results$M1 %>% add_column(method="M1")
M2 <- perm.results$M2 %>% add_column(method="M2")
M3 <- perm.results$M3 %>% add_column(method="M3")
M4 <- perm.results$M4 %>% add_column(method="M4")
M5 <- perm.results$M5 %>% add_column(method="M5")

perm.results <- rbind(M1, M2, M3, M4, M5)

Now, let’s calculate the 95th and 99th percentile for these estimates, and see whether the number of markers we identified with the correct sex assignments is above that.

perm.results %>%
  group_by(method) %>%
  summarise(q95=quantile(markers,0.95),
            q99=quantile(markers,0.99))

## # A tibble: 5 × 3
##   method   q95   q99
##   <chr>  <dbl> <dbl>
## 1 M1         9    21
## 2 M2         1     2
## 3 M3         0     0
## 4 M4         0     1
## 5 M5         5     8

sexlinked <- tibble(method=c("M1","M1","M2","M2",
                             "M3","M3","M4","M4","M5","M5"),
                    sexdet=c("XY","ZW","XY","ZW","XY","ZW","XY","ZW","XY","ZW"),
                    markers=c(4,1,10,0,0,0,0,0,25,1))

#signif <- tibble(method=c("M1"),
#                 markers=c(45))

Plot distribution of permutated estimates as violin plots, and plot estimate from true sex-assignment as coloured dots (blue for XY and red for ZW).

p1 <- ggplot() +
  geom_violin(data=perm.results,
              aes(y=markers,x=method)) +
  geom_point(data=sexlinked,
             aes(y=markers,x=method,
                 fill=sexdet,color=sexdet),
             size = 3) +
  geom_text(aes(x="M2",y=as.double(27)),label="**") +
  geom_text(aes(x="M5",y=as.double(27)),label="**") +
  scale_color_manual(values = c("XY" = "blue", "ZW" = "red")) +
  ylab("Number of sex-linked markers detected") +
  xlab("Method") +
  scale_y_continuous(limits = c(0,28)) +
  theme_classic() +
  theme(legend.position = "none")

p1

## Warning: Removed 83 rows containing non-finite values (stat_ydensity).

Method 2 (Jeffries et al., 2018) and Method 4 (Lambert et al., 2016) found more XY linked markers than the 99th percentile of permuted estimates.

Save the resulting figure

ggsave(filename = "../3_SexLinked/14_2_PermTest_Plot_allMethods_R40.jpg",
       plot=p1,
       device = "jpeg",
       width = 12, height = 4,
       dpi = 300)

## Warning: Removed 83 rows containing non-finite values (stat_ydensity).

Now let’s plot without Method 3 and Method 4, which found no sex-linked markers, to obtain a clearer plot.

p2 <- ggplot() +
  geom_violin(data=perm.results %>% filter(method != "M3" & method != "M4"),
              aes(y=markers,x=method)) +
  geom_point(data=sexlinked %>% filter(method != "M3" & method != "M4"),
             aes(y=markers,x=method,
                 fill=sexdet,color=sexdet),
             size = 3) +
  geom_text(aes(x="M2",y=as.double(45)),label="**") +
  geom_text(aes(x="M5",y=as.double(45)),label="**") +
  scale_color_manual(values = c("XY" = "blue", "ZW" = "red")) +
  ylab("Number of sex-linked markers detected") +
  xlab("Method to detect sex-linked markers") +
  scale_y_continuous(limits = c(0,50)) +
  theme_classic() +
  theme(legend.position = "none")

p2

## Warning: Removed 27 rows containing non-finite values (stat_ydensity).

Save the resulting figure

ggsave(filename = "../3_SexLinked/14_2_PermTest_Plot_R40.jpg",
       plot=p2,
       device = "jpeg",
       width = 9, height = 4,
       dpi = 300)

## Warning: Removed 27 rows containing non-finite values (stat_ydensity).

Let’s look at Method 1, 2 and 5 as histrograms

#95% and 99% quantiles
q95 <- 9
q99 <- 21

markers <- tibble(x=c(1,4),
                  y=c(7500,7500),
                  label=c("ZW","XY"))

pm1 <- ggplot(M1) +
  geom_histogram(aes(x=markers), bins = 26) +
  geom_vline(aes(xintercept=1), colour="red") +
  geom_vline(aes(xintercept=4), colour="blue") +
  geom_label(data=markers, aes(x=x,y=y,label=label)) +
  #xlab("Number of sex-linked loci detected") +
  theme_classic() +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank()) +
  scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
  scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))

pm1

## Warning: Removed 86 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

q95 <- 1
q99 <- 2
markers <- tibble(x=c(0,10),
                  y=c(7500,7500),
                  label=c("ZW","XY"))

pm2 <- ggplot(M2) +
  geom_histogram(aes(x=markers), bins = 26) +
  geom_vline(aes(xintercept=0), colour="red") +
  geom_vline(aes(xintercept=10), colour="blue") +
  geom_label(data=markers, aes(x=x,y=y,label=label)) +
  #xlab("Number of sex-linked loci detected") +
  theme_classic() +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank()) +
  scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
  scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))

pm2

## Warning: Removed 2 rows containing missing values (geom_bar).

q95 <- 5
q99 <- 8
markers <- tibble(x=c(1,25),
                  y=c(7500,7500),
                  label=c("ZW","XY"))

pm5 <- ggplot(M5) +
  geom_histogram(aes(x=markers), bins = 26) +
  geom_vline(aes(xintercept=1), colour="red") +
  geom_vline(aes(xintercept=25), colour="blue") +
  geom_label(data=markers, aes(x=x,y=y,label=label)) +
  xlab("Number of sex-linked loci detected") +
  theme_classic() +
  theme(axis.title.y = element_blank()) +
  scale_x_continuous(breaks = seq(0,25,2), limits = c(-1,26)) +
  scale_y_continuous(breaks = c(0,2500,5000,7500), limits = c(0,9000))

pm5

## Warning: Removed 2 rows containing missing values (geom_bar).

Now arrange three graphs into one

p3 <- ggarrange(pm1,pm2,pm5,ncol=1,
                labels = c("Method 1",
                           "Method 2", "Method 5"),
                font.label = list(size=9), hjust = 5,
                align = "hv")

## Warning: Removed 86 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).
## Removed 2 rows containing missing values (geom_bar).
## Removed 2 rows containing missing values (geom_bar).

p3

ggsave(filename = "../3_SexLinked/14_3_PermTest_HistAll.jpg",
       plot=p3,
       device = "jpeg",
       width = 5, height = 6,
       dpi = 300)

Finally, we can obtain the expected number of false positives (i.e., the average number of detected sex-linked markers).

perm.results %>%
  group_by(method) %>%
  summarise(false.pos = mean(markers), sd = sd(markers))

## # A tibble: 5 × 3
##   method false.pos    sd
##   <chr>      <dbl> <dbl>
## 1 M1        2.07   5.11 
## 2 M2        0.144  0.479
## 3 M3        0.0145 0.248
## 4 M4        0.0246 0.228
## 5 M5        1.51   1.82

14. Permutation test

Lorenzo Bertola

Document last run on 2023-05-08

1. Background

2. Permutation test for number of identified sex linked loci

2.1. Running the permutations on the University HPC

2.2. Plotting the results