In this script I display the linkage maps I produced with LepMap3. I use this visual approach to assess their quality and whether there are any markers causing long gaps that we might want to remove.
Load required packages
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.0.5
library(ggpubr)
library(gplots)
library(reshape2)
library(gridExtra)
library(LinkageMapView)
# make list of file names for files containing map data - e.g., 7_4_HC_male_1d.txt
maps.male <- list.files(path = "../2_LinMap",
pattern = glob2rx(pattern="7_4_HC_male*txt"),
full.names = TRUE)
# name each file according to the linkage group it belongs to
names(maps.male) <- c("1","10","11","12","13","2","3",
"4","5","6","7","8","9")
# this lapply loop reads the individual tab-delimited files into a list
maps.male <- lapply(maps.male, function(m) {
read_delim(m, delim = "\t",
skip = 4,
col_names = c("ID","male","female"))
})
# this mapply loops iterates over the list of dataframes from the previous step and their LG numbers
# it adds a column to each dataframe containing info on the LG number
maps.male <- mapply(function(z,w) {
z %>% mutate("LG" = w)
}, maps.male, names(maps.male), SIMPLIFY = FALSE)
# now that each dataframe has data on map position, SNPID and LG, we can bind them all together for each family
maps.male <- do.call("rbind", maps.male)
# make list of file names for files containing map data - e.g., 7_4_HC_female_1d.txt
maps.female <- list.files(path = "../2_LinMap",
pattern = glob2rx(pattern="7_4_HC_female*txt"),
full.names = TRUE)
# name each file according to the linkage group it belongs to
names(maps.female) <- c("1","10","11","12","13","2","3",
"4","5","6","7","8","9")
# this lapply loop reads the individual tab-delimited files into a list
maps.female <- lapply(maps.female, function(m) {
read_delim(m, delim = "\t",
skip = 4,
col_names = c("ID","male","female"))
})
# this mapply loops iterates over the list of dataframes from the previous step and their LG numbers
# it adds a column to each dataframe containing info on the LG number
maps.female <- mapply(function(z,w) {
z %>% mutate("LG" = w)
}, maps.female, names(maps.female), SIMPLIFY = FALSE)
# now that each dataframe has data on map position, SNPID and LG, we can bind them all together for each family
maps.female <- do.call("rbind", maps.female)
Let’s look at what one of these files looks like now
head(maps.male)
## # A tibble: 6 × 4
## ID male female LG
## <dbl> <dbl> <dbl> <chr>
## 1 1130 0 0 1
## 2 1352 1.32 0.656 1
## 3 363 3.31 1.31 1
## 4 52 5.31 1.31 1
## 5 3550 5.31 11.8 1
## 6 6433 5.31 11.8 1
head(maps.female)
## # A tibble: 6 × 4
## ID male female LG
## <dbl> <dbl> <dbl> <chr>
## 1 614 0 0.327 1
## 2 4508 13.8 0.654 1
## 3 4509 13.8 0.654 1
## 4 2769 13.8 1.31 1
## 5 2770 13.8 1.31 1
## 6 2772 13.8 1.31 1
For some obscure reason (for now), lepmap produces maps for both sexes even though I asked for only a male map and a female map. According to Pasi Rastas (author of LepMap3) the software imputes the position of markers from informative ones. Still unclear to be honest.
To produce more traditional male and female maps (the former including only markers tha are informative in the male, and viceversa for the latter), i used informativeMask=13
and informativeMask=23
.
Let’s full_join these two, retaining only the sex specific map each time.
X <- maps.female %>% select(ID,LG,female)
Y <- maps.male %>% select(ID,LG,male)
maps <- full_join(X, Y, by = c("ID","LG"),
keep = FALSE)
rm(maps.female, maps.male, X, Y)
Unfortunately (and annoyingly) LepMap converts marker IDs to sequential numbers. Let’s retrieve the marker IDs.
# read file
snpids <- read_delim("../2_LinMap/6_1_parcall_HC",
delim = "\t",skip = 7,
col_names = "SNPID")
# add column with sequential ids from 1 to nrow
snpids <- snpids %>% add_column("ID" = seq(1:nrow(snpids)))
Let’s look at what this file looks like
head(snpids)
## # A tibble: 6 × 2
## SNPID ID
## <chr> <int>
## 1 1_67 1
## 2 14_29 2
## 3 14_36 3
## 4 46_28 4
## 5 63_16 5
## 6 80_40 6
Good, now we can join by ID, to get the SNPID into the maps file using left_join
#join
maps <- left_join(maps, snpids, by="ID")
#re-arrange
maps <- maps %>% select(SNPID, ID, LG, male, female)
#remove redundant objects
rm(snpids)
Now, because we ran the male and female maps separately they are sometimes inverted. To find out which ones are inverted, let’s find the position of their max value (first or last row). If the max value is in the first positions then we will invert the map.
#create emtpy tibble
temp <- tibble(
SNPID = character(),
ID = double(),
LG = factor(),
male = double(),
female = double()
)
#loop through linkage groups
for (i in 1:13) {
#retain one linkage group
y <- maps %>% filter(LG == i)
#find max value for female linkage map
my.max <- max(y$female, na.rm = TRUE)
#if covariance is negative (inversely correlated)
if (cov(y$male, y$female,
use = "pairwise.complete.obs") < 0) {
#modify female map to be inverted
z <- y %>%
mutate(female = case_when(
female >= 0 ~ (my.max-female),
TRUE ~ female
))
#add to dataframe
temp <- bind_rows(temp,z)
}
#if covariance is positive
else {
#bind orignal non-modified dataframe
temp <- bind_rows(temp,y)
}
}
#rename (this is redundant)
maps <- temp
#remove unnecessary intermediate files
rm(temp,y,z,i,my.max)
Finally, let’s save these, both as R objects and tab-delimited tables
saveRDS(object = maps,
file = "../2_LinMap/8_1_LepMap_Maps.RData")
write_delim(maps, file = "../2_LinMap/8_2_HC_LepMap_Maps.txt",
delim = "\t",col_names = T)
maps <- maps %>% mutate(LG = fct_relevel(LG, "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"))
p1 <- ggplot(maps,
aes(x=female, y=male,
group = LG, colour = LG)) +
geom_point(size=0.5) + theme_classic() +
scale_y_continuous(limits = c(0, 170),
breaks = seq(0, 150, by = 50)) +
scale_x_continuous(limits = c(0,265),
breaks = seq(0, 250, by = 50))
p1
## Warning: Removed 7033 rows containing missing values (geom_point).
From this we can already see that some linkage groups (e.g., LG2) have really long gaps consisting of no shared markers.
ggsave(filename = "../2_LinMap/8_3_MaleVsFemale_All.jpg",
plot = p1, width = 7, height = 5)
That figure is a bit confusing, showing many LGs at once. Let’s plot each LG individually.
maps.NA <- maps %>% drop_na()
p2 <- ggplot(maps.NA, aes(x=female, y=male,
group = LG, colour = LG)) +
geom_point() + facet_wrap(~LG, ncol = 5) +
ylab("") + xlab("") +
theme(strip.background = element_blank(),
strip.text.x = element_blank()) +
theme_classic()
p2
save the resulting figure
ggsave(filename = "../2_LinMap/8_4_HC_MaleVsFemale.jpg",
plot = p2, width = 12, height = 12)
Especially from the second graph we can see that some linkage groups have very long gaps with no shared markers. While this per-se is not an issue, (could be sex-linked regions with unusual segregation that can’t be mapped in one sex), we can identify areas that are likely to be erroneous and better if removed (i.e., 1-2 markers causing a 10-20+ cM gap).
I have not found a way to detect them automatically yet. So to achieve this, I inspect the male vs female plots, find linkage groups with unusually long gaps (e.g., LG7 for KI), and inspect the end with a long gap. We will do this one LG at a time.
p1 <- ggplot(maps,
aes(x=female, y=male,
group = LG, colour = LG)) +
geom_point(size=0.1) + facet_wrap(~LG, ncol = 5) +
ylab("") + xlab("") +
theme_classic() +
theme(strip.background = element_blank(),
strip.text.x = element_text(size = 5))
p1
## Warning: Removed 7033 rows containing missing values (geom_point).
Here we want to assess LGs 1, 2, 4, 5, 9 and 11 because of long gaps caused by few markers at either end.
To make our life easier when sorting we first calculate the sex-average position for each marker. (male + female / 2 if present in both, only one sex if present in only one sex).
maps <- maps %>%
mutate(sex.averaged = case_when(
is.na(male) == TRUE ~ female,
is.na(female) == TRUE ~ male,
TRUE ~ (male+female)/2)) %>%
arrange(sex.averaged)
p1 <- ggplot(maps %>% filter(LG == 1),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 906 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 1) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 483 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 1) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 423 rows containing missing values (geom_point).
For LG 1 there’s no obvious culprit. There are gaps of non-shared markers but both the female and male maps increase in small steps of < 10 cM
p1 <- ggplot(maps %>% filter(LG == 2),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 885 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 2) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 436 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 2) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 449 rows containing missing values (geom_point).
This time we have a clear long gap in the start of the male map. Let’s identify the marker causing it.
maps %>% filter(LG==2) %>% arrange(male) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 1886_53 2124 2 0 NA 0
## 2 1886_57 2125 2 0 NA 0
## 3 1886_8 2121 2 0.327 NA 0.327
## 4 1886_34 2123 2 0.327 NA 0.327
## 5 4043_39 2414 2 49.4 NA 49.4
## 6 51802_26 1542 2 51.8 NA 51.8
## 7 22927_21 4996 2 52.1 NA 52.1
## 8 247_68 1848 2 52.7 NA 52.7
## 9 43893_59 7560 2 55.1 NA 55.1
## 10 3480_67 2328 2 58.1 NA 58.1
## 11 27071_56 5635 2 60.1 NA 60.1
## 12 9935_28 331 2 60.4 3.98 32.2
## 13 9935_62 3171 2 60.4 NA 60.4
## 14 9319_61 3075 2 61.7 NA 61.7
## 15 41724_48 7376 2 64.4 NA 64.4
## 16 16392_48 534 2 66.7 7.63 37.2
## 17 2893_14 2234 2 68.7 NA 68.7
## 18 2893_57 2236 2 68.7 NA 68.7
## 19 2893_51 123 2 69.4 8.28 38.8
## 20 11784_67 3441 2 71.0 NA 71.0
In this case is 4 snps from the 1886 locus causing a gap of more than 49 cM. We will thus remove this locus as it was likely incorrectly assembled in stacks.
p1 <- ggplot(maps %>% filter(LG == 3),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 872 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 3) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 440 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 3) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 432 rows containing missing values (geom_point).
LG 3 looks good.
p1 <- ggplot(maps %>% filter(LG == 4),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 755 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 4) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 374 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 4) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 381 rows containing missing values (geom_point).
This time we have a clear long gap in the start of the male AND the female map. Let’s identify the markers causing it.
maps %>% filter(LG==4) %>% arrange(male) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_11 620 4 0 NA 0
## 2 19546_37 625 4 0.987 0 0.494
## 3 160312_42 1633 4 16.5 16.2 16.4
## 4 160312_10 1632 4 16.5 16.2 16.4
## 5 40484_25 7270 4 16.5 NA 16.5
## 6 44143_9 7571 4 16.5 NA 16.5
## 7 37599_64 6932 4 16.9 NA 16.9
## 8 160312_36 8579 4 16.9 NA 16.9
## 9 28783_19 5864 4 17.5 NA 17.5
## 10 28783_50 5865 4 17.5 NA 17.5
## 11 28783_52 5866 4 17.5 NA 17.5
## 12 8548_27 299 4 18.5 16.8 17.7
## 13 19831_27 4564 4 18.5 NA 18.5
## 14 19831_41 4565 4 18.8 NA 18.8
## 15 19831_23 4563 4 18.8 NA 18.8
## 16 19831_12 4562 4 18.8 NA 18.8
## 17 18560_52 594 4 21.2 17.2 19.2
## 18 45153_21 7641 4 22.5 NA 22.5
## 19 18868_48 4431 4 23.8 NA 23.8
## 20 34757_14 6599 4 26.1 NA 26.1
For the male 19546 locus is causing a >15 cM gap.
maps %>% filter(LG==4) %>% arrange(female) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_12 621 4 NA 0 0
## 2 19546_37 625 4 0.987 0 0.494
## 3 34343_34 6541 4 NA 15.5 15.5
## 4 35481_52 6699 4 NA 15.5 15.5
## 5 4551_63 2490 4 NA 15.5 15.5
## 6 30077_16 6050 4 NA 16.2 16.2
## 7 160312_42 1633 4 16.5 16.2 16.4
## 8 160312_10 1632 4 16.5 16.2 16.4
## 9 24418_41 5194 4 NA 16.5 16.5
## 10 28783_66 5867 4 NA 16.5 16.5
## 11 8548_27 299 4 18.5 16.8 17.7
## 12 19512_52 4528 4 NA 17.2 17.2
## 13 24418_34 5193 4 NA 17.2 17.2
## 14 18560_52 594 4 21.2 17.2 19.2
## 15 45153_24 7642 4 NA 17.5 17.5
## 16 15487_35 3964 4 NA 17.8 17.8
## 17 34757_10 6598 4 NA 18.5 18.5
## 18 8100_27 285 4 27.4 19.1 23.3
## 19 30265_55 6067 4 NA 20.5 20.5
## 20 31767_6 6222 4 NA 20.5 20.5
The same locus is problematic in the female. We will thus remove it altogether.
maps %>% filter(LG == 4) %>% filter(str_detect(SNPID,"^19546_"))
## # A tibble: 3 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_12 621 4 NA 0 0
## 2 19546_11 620 4 0 NA 0
## 3 19546_37 625 4 0.987 0 0.494
All three markers from this locus occur at the start of the map. We will remove all three.
p1 <- ggplot(maps %>% filter(LG == 5),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 640 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 5) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 361 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 5) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 279 rows containing missing values (geom_point).
This time we have a clear long gap in the start of the female map. Let’s identify the markers causing it.
maps %>% filter(LG==5) %>% arrange(female) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 26181_55 5482 5 NA 0 0
## 2 27284_14 891 5 1.98 20.7 11.3
## 3 20323_51 4660 5 NA 20.7 20.7
## 4 25389_57 5333 5 NA 20.7 20.7
## 5 20323_48 4659 5 NA 20.7 20.7
## 6 13588_7 3705 5 NA 21.3 21.3
## 7 13588_55 3706 5 NA 21.3 21.3
## 8 13873_63 461 5 2.31 21.7 12.0
## 9 48968_67 1482 5 2.31 22.0 12.2
## 10 48968_6 7925 5 NA 22.0 22.0
## 11 20454_22 656 5 3.96 22.3 13.1
## 12 8961_53 309 5 3.96 22.3 13.1
## 13 29928_50 975 5 4.62 22.3 13.5
## 14 29928_41 6022 5 NA 22.3 22.3
## 15 14093_61 3776 5 NA 22.3 22.3
## 16 13080_11 3638 5 NA 22.3 22.3
## 17 14093_48 3774 5 NA 22.3 22.3
## 18 14093_57 3775 5 NA 22.3 22.3
## 19 9600_44 3118 5 NA 22.3 22.3
## 20 3945_55 2402 5 NA 23.6 23.6
One marker, 26181_55, is causing a ~20 cM gap. We will remove it. First, let’s check if there are other markers from that locus both in the linkage group and in the whole map.
maps %>% filter(LG == 5) %>% filter(str_detect(SNPID,"^26181_"))
## # A tibble: 1 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 26181_55 5482 5 NA 0 0
maps %>% filter(str_detect(SNPID,"^26181_"))
## # A tibble: 1 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 26181_55 5482 5 NA 0 0
Only the one (this is good as I did check in an exploratory analyses whether any locus was spread in more than 1 LG and made sure that wasn’t the case)
p1 <- ggplot(maps %>% filter(LG == 6),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 602 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 6) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 312 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 6) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 290 rows containing missing values (geom_point).
LG6 looks good
p1 <- ggplot(maps %>% filter(LG == 7),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 429 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 7) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 227 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 7) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 202 rows containing missing values (geom_point).
LG7 looks good
p1 <- ggplot(maps %>% filter(LG == 8),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 377 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 8) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 195 rows containing missing values (geom_point).
There is a long gap at one end, let’s check how many loci are included.
maps %>% filter(LG==8) %>%
arrange(male) %>%
drop_na(male) %>%
tail(n=10)
## # A tibble: 10 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 8137_6 286 8 95.5 117. 106.
## 2 7715_51 2886 8 96.2 NA 96.2
## 3 12880_45 3604 8 96.5 NA 96.5
## 4 9306_46 3073 8 102. NA 102.
## 5 47323_7 7806 8 102. NA 102.
## 6 9306_64 318 8 105. NA 105.
## 7 2634_6 2190 8 129. NA 129.
## 8 2634_27 2193 8 134. NA 134.
## 9 2634_20 2191 8 135. NA 135.
## 10 2634_21 2192 8 136. NA 136.
It’s one locus causing a 25+ cM gap. This looks sus so we will remove locus 2634.
temp <- maps %>% filter(LG == 8) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 182 rows containing missing values (geom_point).
The female map looks good.
p1 <- ggplot(maps %>% filter(LG == 9),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 381 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 9) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 208 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 9) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 173 rows containing missing values (geom_point).
This time we have a clear long gap at the end of the male map.
maps %>% filter(LG==9) %>% drop_na(male) %>% arrange(desc(male)) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 56691_24 8218 9 151. NA 151.
## 2 56691_26 8219 9 151. NA 151.
## 3 50271_65 1509 9 116. 112. 114.
## 4 50271_36 1507 9 116. 112. 114.
## 5 50271_64 1508 9 116. 112. 114.
## 6 50271_45 8033 9 116. NA 116.
## 7 2698_45 2200 9 114. NA 114.
## 8 16735_50 543 9 113. 109. 111.
## 9 19813_19 638 9 113. 109. 111.
## 10 48250_57 1468 9 113. 109. 111.
## 11 39327_62 1268 9 113. 109. 111.
## 12 13088_10 3640 9 113. NA 113.
## 13 48802_18 7912 9 113. NA 113.
## 14 47546_29 7826 9 113. NA 113.
## 15 47546_61 7829 9 113. NA 113.
## 16 28133_29 912 9 113. 109. 111.
## 17 493_48 41 9 113. 109. 111.
## 18 17882_46 4298 9 112. NA 112.
## 19 16285_45 525 9 110. 109. 109.
## 20 47052_29 7781 9 110. NA 110.
One locus, 56691, is causing a ~40 cM gap. We will remove it. First, let’s check if there are other markers from that locus both in the linkage group and in the whole map.
maps %>% filter(LG == 9) %>% filter(str_detect(SNPID,"^56691_"))
## # A tibble: 2 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 56691_24 8218 9 151. NA 151.
## 2 56691_26 8219 9 151. NA 151.
maps %>% filter(str_detect(SNPID,"^56691_"))
## # A tibble: 2 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 56691_24 8218 9 151. NA 151.
## 2 56691_26 8219 9 151. NA 151.
Only the two we already identified, good.
p1 <- ggplot(maps %>% filter(LG == 10),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 363 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 10) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 166 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 10) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 197 rows containing missing values (geom_point).
LG10 looks good.
p1 <- ggplot(maps %>% filter(LG == 11),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 295 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 11) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 159 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 11) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 136 rows containing missing values (geom_point).
For LG 11 we have markers causing long gaps at the end in both maps.
maps %>% filter(LG==11) %>% drop_na(male) %>% arrange(desc(male)) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_24 624 11 120. 108. 114.
## 2 19546_18 623 11 120. 108. 114.
## 3 19546_53 626 11 120. 108. 114.
## 4 19546_13 622 11 120. NA 120.
## 5 30387_6 1004 11 98.4 79.4 88.9
## 6 36376_55 6797 11 91.8 NA 91.8
## 7 38551_20 7027 11 86.7 NA 86.7
## 8 40264_21 7236 11 86.0 NA 86.0
## 9 40264_44 7237 11 86.0 NA 86.0
## 10 40264_19 7235 11 86.0 NA 86.0
## 11 40480_64 7269 11 80.9 NA 80.9
## 12 40480_62 7268 11 80.9 NA 80.9
## 13 40480_41 7267 11 80.9 NA 80.9
## 14 7508_62 2856 11 79.6 NA 79.6
## 15 26559_56 5531 11 71.1 NA 71.1
## 16 22684_16 4966 11 69.8 NA 69.8
## 17 21243_21 678 11 68.1 69.8 69.0
## 18 17513_29 569 11 67.8 69.8 68.8
## 19 33498_12 6435 11 66.5 NA 66.5
## 20 33498_22 1101 11 66.5 69.5 68.0
maps %>% filter(LG==11) %>% drop_na(female) %>% arrange(desc(female)) %>% head(n=20)
## # A tibble: 20 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_18 623 11 120. 108. 114.
## 2 19546_53 626 11 120. 108. 114.
## 3 19546_24 624 11 120. 108. 114.
## 4 30387_6 1004 11 98.4 79.4 88.9
## 5 30387_12 6079 11 NA 76.4 76.4
## 6 4546_14 2488 11 NA 76.1 76.1
## 7 24830_22 5245 11 NA 75.8 75.8
## 8 24830_11 5244 11 NA 75.8 75.8
## 9 24830_8 5243 11 NA 75.8 75.8
## 10 13300_54 3665 11 NA 75.4 75.4
## 11 38551_48 7028 11 NA 75.1 75.1
## 12 7508_40 2855 11 NA 74.1 74.1
## 13 7508_34 2854 11 NA 74.1 74.1
## 14 7508_16 2853 11 NA 74.1 74.1
## 15 67395_68 8331 11 NA 71.8 71.8
## 16 37074_22 6869 11 NA 70.5 70.5
## 17 37074_46 6870 11 NA 70.5 70.5
## 18 37074_68 6871 11 NA 70.5 70.5
## 19 26559_55 5530 11 NA 70.5 70.5
## 20 41691_66 7372 11 NA 70.1 70.1
One locus, 19546, is causing a ~20 cM gap. We will remove it. First, let’s check if there are other markers from that locus both in the linkage group and in the whole map.
maps %>% filter(LG == 11) %>% filter(str_detect(SNPID,"^19546_"))
## # A tibble: 4 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_18 623 11 120. 108. 114.
## 2 19546_53 626 11 120. 108. 114.
## 3 19546_24 624 11 120. 108. 114.
## 4 19546_13 622 11 120. NA 120.
maps %>% filter(str_detect(SNPID,"^19546_"))
## # A tibble: 7 × 6
## SNPID ID LG male female sex.averaged
## <chr> <dbl> <fct> <dbl> <dbl> <dbl>
## 1 19546_12 621 4 NA 0 0
## 2 19546_11 620 4 0 NA 0
## 3 19546_37 625 4 0.987 0 0.494
## 4 19546_18 623 11 120. 108. 114.
## 5 19546_53 626 11 120. 108. 114.
## 6 19546_24 624 11 120. 108. 114.
## 7 19546_13 622 11 120. NA 120.
This time the problematic locus is found in LG 4 as well. This is the locus we previously identified to remove. Good that we spotted this.
p1 <- ggplot(maps %>% filter(LG == 12),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 281 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 12) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 171 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 12) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 110 rows containing missing values (geom_point).
LG12 looks good
p1 <- ggplot(maps %>% filter(LG == 13),
aes(x=female, y=male)) +
geom_point(size=0.1) +
theme_classic()
p1
## Warning: Removed 247 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 13) %>% arrange(male)
qplot(seq_along(temp$male), temp$male) +
theme_classic()
## Warning: Removed 102 rows containing missing values (geom_point).
temp <- maps %>% filter(LG == 13) %>% arrange(female)
qplot(seq_along(temp$female), temp$female) +
theme_classic()
## Warning: Removed 145 rows containing missing values (geom_point).
LG13 looks also good.
Now, let’s make a list of all markers to remove
HCrem <- c("1886_8","1886_34",
"1886_53","1886_57", #LG2
"26181_55", #LG5
"19546_11","19546_37", #LG4
"2634_6","2634_20",
"2634_21","2634_27", #LG8
"19546_13","19546_18",
"56691_24", "56691_26", #LG9
"19546_24","19546_53") #LG11
And save it.
write_delim(as.data.frame(HCrem),
file = "../2_LinMap/8_5_HC_toRemove.txt",
delim = "/t",
col_names = FALSE)
And remove redundant/intermediate files.
rm(p1,p2,temp,HCrem,maps.NA)
Now, we saw with LG 4 and 11 that they shared a locus (i.e., same stack but different SNPs). This is of course impossible and the result of poorly assembled loci. To address this, let’s see if there are any other loci occuring on multiple linkage groups.
The first step is to create a column with the locus ID only.
maps <- maps %>% mutate(locus = gsub("_.*$","", SNPID))
Now loop through locus ID and retain if they appear in more than one LG.
bad.loci <- c("bad.loci")
for (i in maps$locus) {
y <- maps %>% filter(locus == i)
if (length(unique(y$LG)) > 1) {
bad.loci <- c(bad.loci,i)
}
}
bad.loci
## [1] "bad.loci" "19546" "19546" "19546" "19546" "19546" "19546"
## [8] "19546"
The only one occurring in more than one LG is locus 19546 which we identified above. Good!