1. Background

In this script I display the linkage maps I produced with LepMap3. I use this visual approach to assess their quality and whether there are any markers causing long gaps that we might want to remove.

2. Load packages and data

Load required packages

library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.0.5
library(ggpubr)
library(gplots)
library(reshape2)
library(gridExtra)
library(LinkageMapView)

3. Join all map files for each family

# make list of file names for files containing map data - e.g., 7_4_HC_male_1d.txt
maps.male <- list.files(path = "../2_LinMap",
                  pattern = glob2rx(pattern="7_4_HC_male*txt"),
                  full.names = TRUE)
 
# name each file according to the linkage group it belongs to
names(maps.male) <- c("1","10","11","12","13","2","3",
                "4","5","6","7","8","9")
  
# this lapply loop reads the individual tab-delimited files into a list
maps.male <- lapply(maps.male, function(m) {
    read_delim(m, delim = "\t",
               skip = 4,
               col_names = c("ID","male","female"))
})
  
# this mapply loops iterates over the list of dataframes from the previous step and their LG numbers
# it adds a column to each dataframe containing info on the LG number
maps.male <- mapply(function(z,w) {
    z %>% mutate("LG" = w)
}, maps.male, names(maps.male), SIMPLIFY = FALSE)
  
# now that each dataframe has data on map position, SNPID and LG, we can bind them all together for each family
maps.male <- do.call("rbind", maps.male)
# make list of file names for files containing map data - e.g., 7_4_HC_female_1d.txt
maps.female <- list.files(path = "../2_LinMap",
                  pattern = glob2rx(pattern="7_4_HC_female*txt"),
                  full.names = TRUE)
 
# name each file according to the linkage group it belongs to
names(maps.female) <- c("1","10","11","12","13","2","3",
                "4","5","6","7","8","9")
  
# this lapply loop reads the individual tab-delimited files into a list
maps.female <- lapply(maps.female, function(m) {
    read_delim(m, delim = "\t",
               skip = 4,
               col_names = c("ID","male","female"))
})
  
# this mapply loops iterates over the list of dataframes from the previous step and their LG numbers
# it adds a column to each dataframe containing info on the LG number
maps.female <- mapply(function(z,w) {
    z %>% mutate("LG" = w)
}, maps.female, names(maps.female), SIMPLIFY = FALSE)
  
# now that each dataframe has data on map position, SNPID and LG, we can bind them all together for each family
maps.female <- do.call("rbind", maps.female)

Let’s look at what one of these files looks like now

head(maps.male)
## # A tibble: 6 × 4
##      ID  male female LG   
##   <dbl> <dbl>  <dbl> <chr>
## 1  1130  0     0     1    
## 2  1352  1.32  0.656 1    
## 3   363  3.31  1.31  1    
## 4    52  5.31  1.31  1    
## 5  3550  5.31 11.8   1    
## 6  6433  5.31 11.8   1
head(maps.female)
## # A tibble: 6 × 4
##      ID  male female LG   
##   <dbl> <dbl>  <dbl> <chr>
## 1   614   0    0.327 1    
## 2  4508  13.8  0.654 1    
## 3  4509  13.8  0.654 1    
## 4  2769  13.8  1.31  1    
## 5  2770  13.8  1.31  1    
## 6  2772  13.8  1.31  1

For some obscure reason (for now), lepmap produces maps for both sexes even though I asked for only a male map and a female map. According to Pasi Rastas (author of LepMap3) the software imputes the position of markers from informative ones. Still unclear to be honest.

To produce more traditional male and female maps (the former including only markers tha are informative in the male, and viceversa for the latter), i used informativeMask=13 and informativeMask=23.

Let’s full_join these two, retaining only the sex specific map each time.

X <- maps.female %>% select(ID,LG,female)
Y <- maps.male %>% select(ID,LG,male)
  
maps <- full_join(X, Y, by = c("ID","LG"), 
                 keep = FALSE)

rm(maps.female, maps.male, X, Y)

Unfortunately (and annoyingly) LepMap converts marker IDs to sequential numbers. Let’s retrieve the marker IDs.

# read file
snpids <- read_delim("../2_LinMap/6_1_parcall_HC",
                delim = "\t",skip = 7,
                col_names = "SNPID")
# add column with sequential ids from 1 to nrow
snpids <- snpids %>% add_column("ID" = seq(1:nrow(snpids)))

Let’s look at what this file looks like

head(snpids)
## # A tibble: 6 × 2
##   SNPID    ID
##   <chr> <int>
## 1 1_67      1
## 2 14_29     2
## 3 14_36     3
## 4 46_28     4
## 5 63_16     5
## 6 80_40     6

Good, now we can join by ID, to get the SNPID into the maps file using left_join

#join
maps <- left_join(maps, snpids, by="ID")
  
#re-arrange
maps <- maps %>% select(SNPID, ID, LG, male, female)

#remove redundant objects
rm(snpids)

Now, because we ran the male and female maps separately they are sometimes inverted. To find out which ones are inverted, let’s find the position of their max value (first or last row). If the max value is in the first positions then we will invert the map.

#create emtpy tibble
temp <- tibble(
  SNPID = character(),
  ID = double(),
  LG = factor(),
  male = double(),
  female = double()
  )

#loop through linkage groups
for (i in 1:13) {
  #retain one linkage group
  y <- maps %>% filter(LG == i)
  #find max value for female linkage map
  my.max <- max(y$female, na.rm = TRUE)
    
  #if covariance is negative (inversely correlated)
  if (cov(y$male, y$female,
          use = "pairwise.complete.obs") < 0) {
    #modify female map to be inverted
    z <- y %>%
      mutate(female = case_when(
        female >= 0 ~ (my.max-female),
        TRUE ~ female
        ))
    #add to dataframe
    temp <- bind_rows(temp,z)
    }
  #if covariance is positive
  else {
    #bind orignal non-modified dataframe
    temp <- bind_rows(temp,y)
  }
}

#rename (this is redundant)
maps <- temp

#remove unnecessary intermediate files
rm(temp,y,z,i,my.max)

Finally, let’s save these, both as R objects and tab-delimited tables

saveRDS(object = maps,
        file = "../2_LinMap/8_1_LepMap_Maps.RData")
write_delim(maps, file = "../2_LinMap/8_2_HC_LepMap_Maps.txt",
            delim = "\t",col_names = T)

4. Plot male versus female maps

maps <- maps %>% mutate(LG = fct_relevel(LG, "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"))
p1 <- ggplot(maps,
             aes(x=female, y=male,
                 group = LG, colour = LG)) +
  geom_point(size=0.5) + theme_classic() +
  scale_y_continuous(limits = c(0, 170),
                     breaks = seq(0, 150, by = 50)) +
  scale_x_continuous(limits = c(0,265),
                     breaks = seq(0, 250, by = 50))

p1
## Warning: Removed 7033 rows containing missing values (geom_point).

From this we can already see that some linkage groups (e.g., LG2) have really long gaps consisting of no shared markers.

ggsave(filename = "../2_LinMap/8_3_MaleVsFemale_All.jpg",
       plot = p1, width = 7, height = 5)

That figure is a bit confusing, showing many LGs at once. Let’s plot each LG individually.

maps.NA <- maps %>% drop_na()

p2 <- ggplot(maps.NA, aes(x=female, y=male,
                          group = LG, colour = LG)) +
  geom_point() + facet_wrap(~LG, ncol = 5) +
  ylab("") + xlab("") +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank()) +
  theme_classic()

p2

save the resulting figure

ggsave(filename = "../2_LinMap/8_4_HC_MaleVsFemale.jpg",
       plot = p2, width = 12, height = 12)

5. Identify markers causing long gaps

Especially from the second graph we can see that some linkage groups have very long gaps with no shared markers. While this per-se is not an issue, (could be sex-linked regions with unusual segregation that can’t be mapped in one sex), we can identify areas that are likely to be erroneous and better if removed (i.e., 1-2 markers causing a 10-20+ cM gap).

I have not found a way to detect them automatically yet. So to achieve this, I inspect the male vs female plots, find linkage groups with unusually long gaps (e.g., LG7 for KI), and inspect the end with a long gap. We will do this one LG at a time.

p1 <- ggplot(maps,
             aes(x=female, y=male,
                 group = LG, colour = LG)) +
  geom_point(size=0.1) + facet_wrap(~LG, ncol = 5) +
  ylab("") + xlab("") +
  theme_classic() +
  theme(strip.background = element_blank(),
        strip.text.x = element_text(size = 5))

p1
## Warning: Removed 7033 rows containing missing values (geom_point).

Here we want to assess LGs 1, 2, 4, 5, 9 and 11 because of long gaps caused by few markers at either end.

To make our life easier when sorting we first calculate the sex-average position for each marker. (male + female / 2 if present in both, only one sex if present in only one sex).

maps <- maps %>%
  mutate(sex.averaged = case_when(
    is.na(male) == TRUE ~ female,
    is.na(female) == TRUE ~ male,
    TRUE ~ (male+female)/2)) %>%
  arrange(sex.averaged)

LG1

p1 <- ggplot(maps %>% filter(LG == 1),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 906 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 1) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 483 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 1) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 423 rows containing missing values (geom_point).

For LG 1 there’s no obvious culprit. There are gaps of non-shared markers but both the female and male maps increase in small steps of < 10 cM

LG2

p1 <- ggplot(maps %>% filter(LG == 2),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 885 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 2) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 436 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 2) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 449 rows containing missing values (geom_point).

This time we have a clear long gap in the start of the male map. Let’s identify the marker causing it.

maps %>% filter(LG==2) %>% arrange(male) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID       ID LG      male female sex.averaged
##    <chr>    <dbl> <fct>  <dbl>  <dbl>        <dbl>
##  1 1886_53   2124 2      0      NA           0    
##  2 1886_57   2125 2      0      NA           0    
##  3 1886_8    2121 2      0.327  NA           0.327
##  4 1886_34   2123 2      0.327  NA           0.327
##  5 4043_39   2414 2     49.4    NA          49.4  
##  6 51802_26  1542 2     51.8    NA          51.8  
##  7 22927_21  4996 2     52.1    NA          52.1  
##  8 247_68    1848 2     52.7    NA          52.7  
##  9 43893_59  7560 2     55.1    NA          55.1  
## 10 3480_67   2328 2     58.1    NA          58.1  
## 11 27071_56  5635 2     60.1    NA          60.1  
## 12 9935_28    331 2     60.4     3.98       32.2  
## 13 9935_62   3171 2     60.4    NA          60.4  
## 14 9319_61   3075 2     61.7    NA          61.7  
## 15 41724_48  7376 2     64.4    NA          64.4  
## 16 16392_48   534 2     66.7     7.63       37.2  
## 17 2893_14   2234 2     68.7    NA          68.7  
## 18 2893_57   2236 2     68.7    NA          68.7  
## 19 2893_51    123 2     69.4     8.28       38.8  
## 20 11784_67  3441 2     71.0    NA          71.0

In this case is 4 snps from the 1886 locus causing a gap of more than 49 cM. We will thus remove this locus as it was likely incorrectly assembled in stacks.

LG3

p1 <- ggplot(maps %>% filter(LG == 3),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 872 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 3) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 440 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 3) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 432 rows containing missing values (geom_point).

LG 3 looks good.

LG4

p1 <- ggplot(maps %>% filter(LG == 4),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 755 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 4) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 374 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 4) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 381 rows containing missing values (geom_point).

This time we have a clear long gap in the start of the male AND the female map. Let’s identify the markers causing it.

maps %>% filter(LG==4) %>% arrange(male) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID        ID LG      male female sex.averaged
##    <chr>     <dbl> <fct>  <dbl>  <dbl>        <dbl>
##  1 19546_11    620 4      0       NA          0    
##  2 19546_37    625 4      0.987    0          0.494
##  3 160312_42  1633 4     16.5     16.2       16.4  
##  4 160312_10  1632 4     16.5     16.2       16.4  
##  5 40484_25   7270 4     16.5     NA         16.5  
##  6 44143_9    7571 4     16.5     NA         16.5  
##  7 37599_64   6932 4     16.9     NA         16.9  
##  8 160312_36  8579 4     16.9     NA         16.9  
##  9 28783_19   5864 4     17.5     NA         17.5  
## 10 28783_50   5865 4     17.5     NA         17.5  
## 11 28783_52   5866 4     17.5     NA         17.5  
## 12 8548_27     299 4     18.5     16.8       17.7  
## 13 19831_27   4564 4     18.5     NA         18.5  
## 14 19831_41   4565 4     18.8     NA         18.8  
## 15 19831_23   4563 4     18.8     NA         18.8  
## 16 19831_12   4562 4     18.8     NA         18.8  
## 17 18560_52    594 4     21.2     17.2       19.2  
## 18 45153_21   7641 4     22.5     NA         22.5  
## 19 18868_48   4431 4     23.8     NA         23.8  
## 20 34757_14   6599 4     26.1     NA         26.1

For the male 19546 locus is causing a >15 cM gap.

maps %>% filter(LG==4) %>% arrange(female) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID        ID LG      male female sex.averaged
##    <chr>     <dbl> <fct>  <dbl>  <dbl>        <dbl>
##  1 19546_12    621 4     NA        0          0    
##  2 19546_37    625 4      0.987    0          0.494
##  3 34343_34   6541 4     NA       15.5       15.5  
##  4 35481_52   6699 4     NA       15.5       15.5  
##  5 4551_63    2490 4     NA       15.5       15.5  
##  6 30077_16   6050 4     NA       16.2       16.2  
##  7 160312_42  1633 4     16.5     16.2       16.4  
##  8 160312_10  1632 4     16.5     16.2       16.4  
##  9 24418_41   5194 4     NA       16.5       16.5  
## 10 28783_66   5867 4     NA       16.5       16.5  
## 11 8548_27     299 4     18.5     16.8       17.7  
## 12 19512_52   4528 4     NA       17.2       17.2  
## 13 24418_34   5193 4     NA       17.2       17.2  
## 14 18560_52    594 4     21.2     17.2       19.2  
## 15 45153_24   7642 4     NA       17.5       17.5  
## 16 15487_35   3964 4     NA       17.8       17.8  
## 17 34757_10   6598 4     NA       18.5       18.5  
## 18 8100_27     285 4     27.4     19.1       23.3  
## 19 30265_55   6067 4     NA       20.5       20.5  
## 20 31767_6    6222 4     NA       20.5       20.5

The same locus is problematic in the female. We will thus remove it altogether.

maps %>% filter(LG == 4) %>% filter(str_detect(SNPID,"^19546_"))
## # A tibble: 3 × 6
##   SNPID       ID LG      male female sex.averaged
##   <chr>    <dbl> <fct>  <dbl>  <dbl>        <dbl>
## 1 19546_12   621 4     NA          0        0    
## 2 19546_11   620 4      0         NA        0    
## 3 19546_37   625 4      0.987      0        0.494

All three markers from this locus occur at the start of the map. We will remove all three.

LG5

p1 <- ggplot(maps %>% filter(LG == 5),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 640 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 5) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 361 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 5) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 279 rows containing missing values (geom_point).

This time we have a clear long gap in the start of the female map. Let’s identify the markers causing it.

maps %>% filter(LG==5) %>% arrange(female) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID       ID LG     male female sex.averaged
##    <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
##  1 26181_55  5482 5     NA       0            0  
##  2 27284_14   891 5      1.98   20.7         11.3
##  3 20323_51  4660 5     NA      20.7         20.7
##  4 25389_57  5333 5     NA      20.7         20.7
##  5 20323_48  4659 5     NA      20.7         20.7
##  6 13588_7   3705 5     NA      21.3         21.3
##  7 13588_55  3706 5     NA      21.3         21.3
##  8 13873_63   461 5      2.31   21.7         12.0
##  9 48968_67  1482 5      2.31   22.0         12.2
## 10 48968_6   7925 5     NA      22.0         22.0
## 11 20454_22   656 5      3.96   22.3         13.1
## 12 8961_53    309 5      3.96   22.3         13.1
## 13 29928_50   975 5      4.62   22.3         13.5
## 14 29928_41  6022 5     NA      22.3         22.3
## 15 14093_61  3776 5     NA      22.3         22.3
## 16 13080_11  3638 5     NA      22.3         22.3
## 17 14093_48  3774 5     NA      22.3         22.3
## 18 14093_57  3775 5     NA      22.3         22.3
## 19 9600_44   3118 5     NA      22.3         22.3
## 20 3945_55   2402 5     NA      23.6         23.6

One marker, 26181_55, is causing a ~20 cM gap. We will remove it. First, let’s check if there are other markers from that locus both in the linkage group and in the whole map.

maps %>% filter(LG == 5) %>% filter(str_detect(SNPID,"^26181_"))
## # A tibble: 1 × 6
##   SNPID       ID LG     male female sex.averaged
##   <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
## 1 26181_55  5482 5        NA      0            0
maps %>% filter(str_detect(SNPID,"^26181_"))
## # A tibble: 1 × 6
##   SNPID       ID LG     male female sex.averaged
##   <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
## 1 26181_55  5482 5        NA      0            0

Only the one (this is good as I did check in an exploratory analyses whether any locus was spread in more than 1 LG and made sure that wasn’t the case)

LG6

p1 <- ggplot(maps %>% filter(LG == 6),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 602 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 6) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 312 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 6) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 290 rows containing missing values (geom_point).

LG6 looks good

LG7

p1 <- ggplot(maps %>% filter(LG == 7),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 429 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 7) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 227 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 7) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 202 rows containing missing values (geom_point).

LG7 looks good

LG8

p1 <- ggplot(maps %>% filter(LG == 8),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 377 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 8) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 195 rows containing missing values (geom_point).

There is a long gap at one end, let’s check how many loci are included.

maps %>% filter(LG==8) %>%
  arrange(male) %>%
  drop_na(male) %>%
  tail(n=10)
## # A tibble: 10 × 6
##    SNPID       ID LG     male female sex.averaged
##    <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
##  1 8137_6     286 8      95.5   117.        106. 
##  2 7715_51   2886 8      96.2    NA          96.2
##  3 12880_45  3604 8      96.5    NA          96.5
##  4 9306_46   3073 8     102.     NA         102. 
##  5 47323_7   7806 8     102.     NA         102. 
##  6 9306_64    318 8     105.     NA         105. 
##  7 2634_6    2190 8     129.     NA         129. 
##  8 2634_27   2193 8     134.     NA         134. 
##  9 2634_20   2191 8     135.     NA         135. 
## 10 2634_21   2192 8     136.     NA         136.

It’s one locus causing a 25+ cM gap. This looks sus so we will remove locus 2634.

temp <- maps %>% filter(LG == 8) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 182 rows containing missing values (geom_point).

The female map looks good.

LG9

p1 <- ggplot(maps %>% filter(LG == 9),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 381 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 9) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 208 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 9) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 173 rows containing missing values (geom_point).

This time we have a clear long gap at the end of the male map.

maps %>% filter(LG==9) %>% drop_na(male) %>% arrange(desc(male)) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID       ID LG     male female sex.averaged
##    <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
##  1 56691_24  8218 9      151.    NA          151.
##  2 56691_26  8219 9      151.    NA          151.
##  3 50271_65  1509 9      116.   112.         114.
##  4 50271_36  1507 9      116.   112.         114.
##  5 50271_64  1508 9      116.   112.         114.
##  6 50271_45  8033 9      116.    NA          116.
##  7 2698_45   2200 9      114.    NA          114.
##  8 16735_50   543 9      113.   109.         111.
##  9 19813_19   638 9      113.   109.         111.
## 10 48250_57  1468 9      113.   109.         111.
## 11 39327_62  1268 9      113.   109.         111.
## 12 13088_10  3640 9      113.    NA          113.
## 13 48802_18  7912 9      113.    NA          113.
## 14 47546_29  7826 9      113.    NA          113.
## 15 47546_61  7829 9      113.    NA          113.
## 16 28133_29   912 9      113.   109.         111.
## 17 493_48      41 9      113.   109.         111.
## 18 17882_46  4298 9      112.    NA          112.
## 19 16285_45   525 9      110.   109.         109.
## 20 47052_29  7781 9      110.    NA          110.

One locus, 56691, is causing a ~40 cM gap. We will remove it. First, let’s check if there are other markers from that locus both in the linkage group and in the whole map.

maps %>% filter(LG == 9) %>% filter(str_detect(SNPID,"^56691_"))
## # A tibble: 2 × 6
##   SNPID       ID LG     male female sex.averaged
##   <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
## 1 56691_24  8218 9      151.     NA         151.
## 2 56691_26  8219 9      151.     NA         151.
maps %>% filter(str_detect(SNPID,"^56691_"))
## # A tibble: 2 × 6
##   SNPID       ID LG     male female sex.averaged
##   <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
## 1 56691_24  8218 9      151.     NA         151.
## 2 56691_26  8219 9      151.     NA         151.

Only the two we already identified, good.

LG10

p1 <- ggplot(maps %>% filter(LG == 10),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 363 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 10) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 166 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 10) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 197 rows containing missing values (geom_point).

LG10 looks good.

LG11

p1 <- ggplot(maps %>% filter(LG == 11),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 295 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 11) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 159 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 11) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 136 rows containing missing values (geom_point).

For LG 11 we have markers causing long gaps at the end in both maps.

maps %>% filter(LG==11) %>% drop_na(male) %>% arrange(desc(male)) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID       ID LG     male female sex.averaged
##    <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
##  1 19546_24   624 11    120.   108.         114. 
##  2 19546_18   623 11    120.   108.         114. 
##  3 19546_53   626 11    120.   108.         114. 
##  4 19546_13   622 11    120.    NA          120. 
##  5 30387_6   1004 11     98.4   79.4         88.9
##  6 36376_55  6797 11     91.8   NA           91.8
##  7 38551_20  7027 11     86.7   NA           86.7
##  8 40264_21  7236 11     86.0   NA           86.0
##  9 40264_44  7237 11     86.0   NA           86.0
## 10 40264_19  7235 11     86.0   NA           86.0
## 11 40480_64  7269 11     80.9   NA           80.9
## 12 40480_62  7268 11     80.9   NA           80.9
## 13 40480_41  7267 11     80.9   NA           80.9
## 14 7508_62   2856 11     79.6   NA           79.6
## 15 26559_56  5531 11     71.1   NA           71.1
## 16 22684_16  4966 11     69.8   NA           69.8
## 17 21243_21   678 11     68.1   69.8         69.0
## 18 17513_29   569 11     67.8   69.8         68.8
## 19 33498_12  6435 11     66.5   NA           66.5
## 20 33498_22  1101 11     66.5   69.5         68.0
maps %>% filter(LG==11) %>% drop_na(female) %>% arrange(desc(female)) %>% head(n=20)
## # A tibble: 20 × 6
##    SNPID       ID LG     male female sex.averaged
##    <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
##  1 19546_18   623 11    120.   108.         114. 
##  2 19546_53   626 11    120.   108.         114. 
##  3 19546_24   624 11    120.   108.         114. 
##  4 30387_6   1004 11     98.4   79.4         88.9
##  5 30387_12  6079 11     NA     76.4         76.4
##  6 4546_14   2488 11     NA     76.1         76.1
##  7 24830_22  5245 11     NA     75.8         75.8
##  8 24830_11  5244 11     NA     75.8         75.8
##  9 24830_8   5243 11     NA     75.8         75.8
## 10 13300_54  3665 11     NA     75.4         75.4
## 11 38551_48  7028 11     NA     75.1         75.1
## 12 7508_40   2855 11     NA     74.1         74.1
## 13 7508_34   2854 11     NA     74.1         74.1
## 14 7508_16   2853 11     NA     74.1         74.1
## 15 67395_68  8331 11     NA     71.8         71.8
## 16 37074_22  6869 11     NA     70.5         70.5
## 17 37074_46  6870 11     NA     70.5         70.5
## 18 37074_68  6871 11     NA     70.5         70.5
## 19 26559_55  5530 11     NA     70.5         70.5
## 20 41691_66  7372 11     NA     70.1         70.1

One locus, 19546, is causing a ~20 cM gap. We will remove it. First, let’s check if there are other markers from that locus both in the linkage group and in the whole map.

maps %>% filter(LG == 11) %>% filter(str_detect(SNPID,"^19546_"))
## # A tibble: 4 × 6
##   SNPID       ID LG     male female sex.averaged
##   <chr>    <dbl> <fct> <dbl>  <dbl>        <dbl>
## 1 19546_18   623 11     120.   108.         114.
## 2 19546_53   626 11     120.   108.         114.
## 3 19546_24   624 11     120.   108.         114.
## 4 19546_13   622 11     120.    NA          120.
maps %>% filter(str_detect(SNPID,"^19546_"))
## # A tibble: 7 × 6
##   SNPID       ID LG       male female sex.averaged
##   <chr>    <dbl> <fct>   <dbl>  <dbl>        <dbl>
## 1 19546_12   621 4      NA         0         0    
## 2 19546_11   620 4       0        NA         0    
## 3 19546_37   625 4       0.987     0         0.494
## 4 19546_18   623 11    120.      108.      114.   
## 5 19546_53   626 11    120.      108.      114.   
## 6 19546_24   624 11    120.      108.      114.   
## 7 19546_13   622 11    120.       NA       120.

This time the problematic locus is found in LG 4 as well. This is the locus we previously identified to remove. Good that we spotted this.

LG12

p1 <- ggplot(maps %>% filter(LG == 12),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 281 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 12) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 171 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 12) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 110 rows containing missing values (geom_point).

LG12 looks good

LG13

p1 <- ggplot(maps %>% filter(LG == 13),
             aes(x=female, y=male)) +
  geom_point(size=0.1) +
  theme_classic()

p1
## Warning: Removed 247 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 13) %>% arrange(male)

qplot(seq_along(temp$male), temp$male) +
  theme_classic()
## Warning: Removed 102 rows containing missing values (geom_point).

temp <- maps %>% filter(LG == 13) %>% arrange(female)

qplot(seq_along(temp$female), temp$female) +
  theme_classic()
## Warning: Removed 145 rows containing missing values (geom_point).

LG13 looks also good.

Now, let’s make a list of all markers to remove

HCrem <- c("1886_8","1886_34",
           "1886_53","1886_57", #LG2
           "26181_55", #LG5
           "19546_11","19546_37", #LG4
           "2634_6","2634_20",
           "2634_21","2634_27",  #LG8
           "19546_13","19546_18",
           "56691_24", "56691_26", #LG9
           "19546_24","19546_53") #LG11

And save it.

write_delim(as.data.frame(HCrem),
            file = "../2_LinMap/8_5_HC_toRemove.txt",
            delim = "/t",
            col_names = FALSE)

And remove redundant/intermediate files.

rm(p1,p2,temp,HCrem,maps.NA)

6. Find loci occurring across multiple linkage groups

Now, we saw with LG 4 and 11 that they shared a locus (i.e., same stack but different SNPs). This is of course impossible and the result of poorly assembled loci. To address this, let’s see if there are any other loci occuring on multiple linkage groups.

The first step is to create a column with the locus ID only.

maps <- maps %>% mutate(locus = gsub("_.*$","", SNPID))

Now loop through locus ID and retain if they appear in more than one LG.

bad.loci <- c("bad.loci")

for (i in maps$locus) {
  y <- maps %>% filter(locus == i)
  
  if (length(unique(y$LG)) > 1) {
   bad.loci <- c(bad.loci,i) 
  }
}

bad.loci
## [1] "bad.loci" "19546"    "19546"    "19546"    "19546"    "19546"    "19546"   
## [8] "19546"

The only one occurring in more than one LG is locus 19546 which we identified above. Good!