## Time-stamp: <2020-01-31 13:53:49 chl> ## ## Estimate empirically the probability of finding a substring in a larger ## string, given a four-letter alphabet. ## This should be 1 - (1 - 1/4^k)^n, where k is the length of the substring ## and n that of the larger string. ## ## NOTE We should probably replace 1 - (.)^n with 1 - (.)^(n-k+1) for small ## string, and also correct the equiprobable outcome (1/4) with GC content. ## Finally, for exact computation, a discrete Markov model could be used, ## e.g. https://stats.stackexchange.com/a/362638. ## alph <- c("A", "C", "G", "T") sseq <- "AGCAAT" trials <- 1e4 pr <- 1 - (1 - 1 / 4^nchar(sseq))^len prc <- 1 - (1 - 1 / 4^nchar(sseq))^(len - nchar(sseq) + 1) sim <- function(n, pat) { str <- paste(sample(alph, n, rep = TRUE), collapse = "") out <- gregexpr(pat, str) if (length(out[[1]]) == 1 && out[[1]] == -1) { return(0) } else { return(length(out[[1]])) } } len <- seq(100, 1e4, length.out = 10) res <- pp <- numeric(length(len)) for (k in seq(along = len)) { tmp <- replicate(trials, sim(len[k], sseq)) res[k] <- sum(tmp > 0) / trials pp[k] <- (1 - (1 - 1 / 4^nchar(sseq))^len[k]) } plot(len, res, xlab = "Genome length", ylab = paste0("Pr(", sseq, ")"), ylim = c(0, 1)) lines(len, pp, col = "orange", lwd = 2) len <- seq(10, 100, length.out = 10) res <- pp <- ppc <- numeric(length(len)) for (k in seq(along = len)) { tmp <- replicate(trials, sim(len[k], sseq)) res[k] <- sum(tmp > 0) / trials pp[k] <- (1 - (1 - 1 / 4^nchar(sseq))^len[k]) ppc[k] <- (1 - (1 - 1 / 4^nchar(sseq))^(len[k] - nchar(sseq) + 1)) } plot(len, -log10(res), xlab = "Genome length", ylab = paste0("-log10[Pr(", sseq, ")]")) lines(len, -log10(pp), col = "orange", lwd = 2) lines(len, -log10(ppc), col = "dodgerblue", lwd = 2)