1. Background

We have now finished producing the linkage maps. We can then start to produce some summary statistics (e.g., interval between unique positions) and plot the resulting maps.

2. Load required packages

library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.0.5
library(ggpubr)
library(gplots)
library(reshape2)
library(gridExtra)
library(LinkageMapView)

3. Join all LG maps to one file

# make list of file names for files containing map data - e.g., 7_4_HC_male_1d.txt
maps.male <- list.files(path = "../2_LinMap",
                  pattern = glob2rx(pattern="9_6_HC_male*txt"),
                  full.names = TRUE)
 
# name each file according to the linkage group it belongs to
names(maps.male) <- c("1","10","11","12","13","2","3",
                "4","5","6","7","8","9")
  
# this lapply loop reads the individual tab-delimited files into a list
maps.male <- lapply(maps.male, function(m) {
    read_delim(m, delim = "\t",
               skip = 3,
               col_names = c("ID","male","female"))
})
  
# this mapply loops iterates over the list of dataframes from the previous step and their LG numbers
# it adds a column to each dataframe containing info on the LG number
maps.male <- mapply(function(z,w) {
    z %>% mutate("LG" = w)
}, maps.male, names(maps.male), SIMPLIFY = FALSE)
  
# now that each dataframe has data on map position, SNPID and LG, we can bind them all together for each family
maps.male <- do.call("rbind", maps.male)
# make list of file names for files containing map data - e.g., 7_4_HC_female_1d.txt
maps.female <- list.files(path = "../2_LinMap",
                  pattern = glob2rx(pattern="9_6_HC_female*txt"),
                  full.names = TRUE)
 
# name each file according to the linkage group it belongs to
names(maps.female) <- c("1","10","11","12","13","2","3",
                "4","5","6","7","8","9")
  
# this lapply loop reads the individual tab-delimited files into a list
maps.female <- lapply(maps.female, function(m) {
    read_delim(m, delim = "\t",
               skip = 3,
               col_names = c("ID","male","female"))
})
  
# this mapply loops iterates over the list of dataframes from the previous step and their LG numbers
# it adds a column to each dataframe containing info on the LG number
maps.female <- mapply(function(z,w) {
    z %>% mutate("LG" = w)
}, maps.female, names(maps.female), SIMPLIFY = FALSE)
  
# now that each dataframe has data on map position, SNPID and LG, we can bind them all together for each family
maps.female <- do.call("rbind", maps.female)

Let’s look at what one of these files looks like now

head(maps.male)
## # A tibble: 6 × 4
##      ID  male female LG   
##   <dbl> <dbl>  <dbl> <chr>
## 1  1130  0     0     1    
## 2  1129  0     0     1    
## 3  1352  1.32  0.656 1    
## 4   363  3.31  1.31  1    
## 5    52  5.31  1.31  1    
## 6  6433  5.31 11.8   1
head(maps.female)
## # A tibble: 6 × 4
##      ID  male female LG   
##   <dbl> <dbl>  <dbl> <chr>
## 1   613   0    0     1    
## 2   614   0    0.327 1    
## 3  4509  13.8  0.654 1    
## 4  4508  13.8  0.654 1    
## 5  2769  13.8  1.31  1    
## 6  2771  13.8  1.31  1

Let’s join the male informative and female informative map together with a full_join.

X <- maps.female %>% select(ID,LG,female)
Y <- maps.male %>% select(ID,LG,male)
  
maps <- full_join(X, Y, by = c("ID","LG"), 
                 keep = FALSE)

rm(maps.female, maps.male, X, Y)

Unfortunately (and annoyingly) LepMap converts marker IDs to sequential numbers. Let’s retrieve the marker IDs.

# read file
snpids <- read_delim("../2_LinMap/6_1_parcall_HC",
                delim = "\t",skip = 7,
                col_names = "SNPID")
# add column with sequential ids from 1 to nrow
snpids <- snpids %>% add_column("ID" = seq(1:nrow(snpids)))

Let’s look at what one of these files looks like

head(snpids)
## # A tibble: 6 × 2
##   SNPID    ID
##   <chr> <int>
## 1 1_67      1
## 2 14_29     2
## 3 14_36     3
## 4 46_28     4
## 5 63_16     5
## 6 80_40     6

Good, now we can join by ID, to get the SNPID into the maps file. Let’s use a mapply loop and a left_join

#join
maps <- left_join(maps, snpids, by="ID")
  
#re-arrange
maps <- maps %>% select(SNPID, ID, LG, male, female)

#remove redundant objects
rm(snpids)

Now, because we ran the male and female maps separately they are sometimes inverted. To find out which ones are inverted, let’s find the position of their max value (first or last row). If the max value is in the first positions then we will invert the map.

#create emtpy tibble
temp <- tibble(
  SNPID = character(),
  ID = double(),
  LG = factor(),
  male = double(),
  female = double()
  )

#loop through linkage groups
for (i in 1:13) {
  #retain one linkage group
  y <- maps %>% filter(LG == i)
  #find max value for female linkage map
  my.max <- max(y$female, na.rm = TRUE)
    
  #if covariance is negative (inversely correlated)
  if (cov(y$male, y$female,
          use = "pairwise.complete.obs") < 0) {
    #modify female map to be inverted
    z <- y %>%
      mutate(female = case_when(
        female >= 0 ~ (my.max-female),
        TRUE ~ female
        ))
    #add to dataframe
    temp <- bind_rows(temp,z)
    }
  #if covariance is positive
  else {
    #bind orignal non-modified dataframe
    temp <- bind_rows(temp,y)
  }
}

#rename (this is redundant)
maps <- temp

#remove unnecessary intermediate files
rm(temp,y,z,i,my.max)

4. Make publication tables

Now that we have the final maps, let’s summarise the information in a few ways. First, I will extend the map txt delimited files to include additional things like sex-averaged position (male+female/2), locus ID, the fasta sequence, etc.

4.1. Add Locus ID

maps <- maps %>%
    mutate(Locus = gsub("_.*","",SNPID)) %>%
    select(Locus, SNPID, ID, LG, male, female)

4.2. Calculate sex-averaged map

To calculate the sex-averaged map position we can simply add the male and female position and divide by two as we did before when exploring our first maps. The exception are markers for which either parent was not informative. For those we will retain the position from the known parent.

maps <- maps %>%
  mutate(sex.averaged = case_when(
    is.na(male) == TRUE ~ female,
    is.na(female) == TRUE ~ male,
    TRUE ~ (male+female)/2))

4.3. Add locus fasta sequence

This might be useful for anyone trying to use this map to blast it to other genomes, etc.

To achieve this, first thing we need to do is retain a list of unique Locus IDs first.

my.ids <- maps %>% pull(Locus) %>% unique()

Next we want to save it as a single column, and as a tab delimited file

write_delim(as.data.frame(my.ids),
            file = "../2_LinMap/10_1_HC_LepMap_LociIDs.txt",
              delim = "\t",col_names = FALSE)

Now we extract the corresponding fasta sequences from the stacks catalogue with seqtk.

#!/bin/bash

#load seqtk
module load seqtk/1.3

#run seqtk for each family
seqtk subseq ./1_DataPrep/catalog.fa.gz ./2_LinMap/10_1_HC_LepMap_LociIDs.txt > ./2_LinMap/10_2_HC_LepMap_Loci.fa

The resulting output has two lines per locus. The first one has the format >locusID NS=... while the second one contains the fasta sequence.

To convert this to tabular format to then merge it with my current table I downloaded pyfaidx version 0.6.4 (05 April 2022). I then used the below command.

faidx --transform transposed 10_2_HC_LepMap_Loci.fa > 10_3_HC_LepMap_Loci.tsv

Let’s import the resulting file

my.fastas <- read_delim("../2_LinMap/10_3_HC_LepMap_Loci.tsv",
                        delim = "\t",
                        col_names = c("Locus","some",
                                      "length","fasta"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Locus = col_double(),
##   some = col_double(),
##   length = col_double(),
##   fasta = col_character()
## )

Let’s retain only columns of interest (i.e., locus and fasta)

my.fastas <- my.fastas %>%
  select(Locus, fasta, length) %>%
  mutate(Locus = as.character(Locus))

And merge with the current table

maps <- left_join(maps, my.fastas, by = "Locus")

4.4. Add segregation information

Now let’s add the segregation information (i.e., the assigned segregation based on offspring genotypes, the actual parent genotypes, and the offspring genotype counts).

Let’s read in the files from the data preparation/filtering step.

my.meta.1 <- readRDS("../1_DataPrep/4_1_FilteredMarkerData_Informative_HC.RData")

my.meta.2 <- readRDS("../1_DataPrep/3_2_FilteredMarkerData_Informative.RData")

Let’s retain only columns of interest

my.meta.1 <- my.meta.1 %>%
  select(markerID, Segregation, tag, FAT_M1, MOT_F1,
         Zero, One, Two) %>%
  rename(SNPID = markerID)

my.meta.2 <- my.meta.2 %>%
  select(markerID, ABxAB, ABxAA, ABxBB,
         AAxBB, AAxAA, BBxBB, NULLS) %>%
  rename(SNPID = markerID)

Now we can merge based on marker ID.

maps <- left_join(maps, my.meta.1, by = "SNPID")
maps <- left_join(maps, my.meta.2, by = "SNPID")

Anything else we want to add? Think about it tomorrow. For now let’s save as an R object

saveRDS(maps, file = "../2_LinMap/10_4_linkageMap.RData")

And as a text delimited file

write_delim(maps, file = "../2_LinMap/10_4_linkageMap.txt", delim = "\t")

Finally, this is very redundant but satisfies my OCD, let’s save this table for the supplementary material

write_delim(maps,
            file = "../publication/SuppTable1.txt",
            delim = "\t")