Aliases: grep grepl sub gsub regexpr gregexpr regexec gregexec
### ** Examples

# Every element of `letters` is a lower-case letter, so all indices match.
grep("[a-z]", letters)
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 [26] 26
txt <- c("arm", "foot", "lefroo", "bafoobar")
# grep() returns the indices of the matching elements; a non-zero length
# therefore means that at least one element contains "foo".
if (length(i <- grep("foo", txt)) > 0) {
  cat("'foo' appears at least once in\n\t", txt, "\n")
}
'foo' appears at least once in arm foot lefroo bafoobar
i # 2 and 4
[1] 2 4
txt[i]
[1] "foot" "bafoobar"
## Double all 'a' or 'b's;  "\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\1_\\1_", "abc and ABC")
[1] "a_a_b_b_c a_a_nd ABC"
txt <- c(
  "The", "licenses", "for", "most", "software", "are", "designed", "to",
  "take", "away", "your", "freedom", "to", "share", "and", "change", "it.",
  "", "By", "contrast,", "the", "GNU", "General", "Public", "License", "is",
  "intended", "to", "guarantee", "your", "freedom", "to", "share", "and",
  "change", "free", "software", "--", "to", "make", "sure", "the",
  "software", "is", "free", "for", "all", "its", "users"
)
# The outer parentheses make the assignment print its value.
(i <- grep("[gu]", txt)) # indices
[1] 7 11 16 24 29 30 35 41 49
# Indexing with the positions from grep() gives the same elements as
# asking grep() for the values directly.
stopifnot(txt[i] == grep("[gu]", txt, value = TRUE))

## Note that for some implementations character ranges are
## locale-dependent (but not currently).  Then [b-e] in locales such as
## en_US may include B as the collation order is aAbBcCdDe ...
(ot <- sub("[b-e]", ".", txt))
[1] "Th." "li.enses" "for" "most" "softwar." "ar." [7] ".esigned" "to" "tak." "away" "your" "fr.edom" [13] "to" "shar." "an." ".hange" "it." "" [19] "By" ".ontrast," "th." "GNU" "G.neral" "Pu.lic" [25] "Li.ense" "is" "int.nded" "to" "guarant.e" "your" [31] "fr.edom" "to" "shar." "an." ".hange" "fr.e" [37] "softwar." "--" "to" "mak." "sur." "th." [43] "softwar." "is" "fr.e" "for" "all" "its" [49] "us.rs"
txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution
[1] "licenses" "designed" "freedom" "change" "General" "Public" [7] "License" "intended" "guarantee" "freedom" "change" "free" [13] "free"
## In caseless matching, ranges include both cases:
a <- grep("[b-e]", txt, value = TRUE)
b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE)
# Elements that match only when case is ignored.
setdiff(b, a)
[1] "By"
txt[gsub("g","#", txt) != gsub("g","#", txt, ignore.case = TRUE)] # the "G" words
[1] "GNU" "General"
regexpr("en", txt)
[1] -1 4 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 2 -1 4 [26] -1 4 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 attr(,"match.length") [1] -1 2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 2 -1 2 [26] -1 2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE
gregexpr("e", txt)
[[1]] [1] 3 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[2]] [1] 4 7 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[3]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[4]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[5]] [1] 8 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[6]] [1] 3 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[7]] [1] 2 7 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[8]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[9]] [1] 4 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[10]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[11]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[12]] [1] 3 4 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[13]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[14]] [1] 5 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[15]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[16]] [1] 6 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[17]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[18]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[19]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[20]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" 
attr(,"useBytes") [1] TRUE [[21]] [1] 3 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[22]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[23]] [1] 2 4 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[24]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[25]] [1] 4 7 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[26]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[27]] [1] 4 7 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[28]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[29]] [1] 8 9 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[30]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[31]] [1] 3 4 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[32]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[33]] [1] 5 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[34]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[35]] [1] 6 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[36]] [1] 3 4 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[37]] [1] 8 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[38]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[39]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[40]] [1] 4 
attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[41]] [1] 4 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[42]] [1] 3 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[43]] [1] 8 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[44]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[45]] [1] 3 4 attr(,"match.length") [1] 1 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[46]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[47]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[48]] [1] -1 attr(,"match.length") [1] -1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE [[49]] [1] 3 attr(,"match.length") [1] 1 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE
## Using grepl() for filtering
## Find functions with argument names matching "warn":

# findArgs: list the functions in an environment whose formal argument
# names match a regular expression.
#   env     - anything as.environment() accepts, e.g. "package:base"
#   pattern - regular expression, matched case-insensitively against each
#             formal argument name
# Returns a named list with one element per matching function, holding
# that function's formal argument names.
findArgs <- function(env, pattern) {
  e <- as.environment(env)
  nms <- ls(envir = e)
  nms <- nms[is.na(match(nms, c("F", "T")))] # <-- work around "checking hack"
  # Look each name up in `e` itself so that locals of this function
  # (e.g. `pattern`, `env`, `nms`) cannot shadow objects of the same name.
  # lapply() keeps the result a list even when every function has the same
  # number of arguments (sapply() would simplify to a vector or matrix).
  aa <- lapply(nms, function(nm) {
    o <- get(nm, envir = e)
    if (is.function(o)) names(formals(o))
  })
  names(aa) <- nms
  # vapply() guarantees a logical index, including for an empty `aa`.
  iw <- vapply(
    aa,
    function(a) any(grepl(pattern, a, ignore.case = TRUE)),
    logical(1)
  )
  aa[iw]
}
findArgs("package:base", "warn")
$attach [1] "what" "pos" "name" "warn.conflicts" $dir.create [1] "path" "showWarnings" "recursive" "mode" $file.create [1] "..." "showWarnings" $library [1] "package" "help" "pos" "lib.loc" [5] "character.only" "logical.return" "warn.conflicts" "quietly" [9] "verbose" "mask.ok" "exclude" "include.only" [13] "attach.required" $readLines [1] "con" "n" "ok" "warn" "encoding" "skipNul" $require [1] "package" "lib.loc" "quietly" "warn.conflicts" [5] "character.only" "mask.ok" "exclude" "include.only" [9] "attach.required"
## trim trailing white space
str <- "Now is the time "
sub(" +$", "", str)  ## spaces only
[1] "Now is the time"
## what is considered 'white space' depends on the locale.
sub("[[:space:]]+$", "", str)  ## white space, POSIX-style
[1] "Now is the time"
## what PCRE considered white space changed in version 8.34: see ?regex
sub("\\s+$", "", str, perl = TRUE)  ## PCRE-style white space
[1] "Now is the time"
## capitalizing
txt <- "a test of capitalizing"
# \\U upper-cases the first captured letter, \\L lower-cases the rest of
# the word (perl = TRUE is required for these case-conversion escapes).
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl = TRUE)
[1] "A Test Of Capitalizing"
gsub("\\b(\\w)", "\\U\\1", txt, perl=TRUE)
[1] "A Test Of Capitalizing"
txt2 <- "useRs may fly into JFK or laGuardia"
# Upper-case the first and last letter of every word; \\E ends the
# case conversion so the middle of the word is left unchanged.
gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl = TRUE)
[1] "UseRS MaY FlY IntO JFK OR LaGuardiA"
sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)
[1] "UseRS may fly into JFK or laGuardia"
## named capture
# The first string starts with two spaces, so its first match begins at
# character 3; the second starts with a tab, so its match begins at 2.
notables <- c("  Ben Franklin and Jefferson Davis",
              "\tMillard Fillmore")
# name groups 'first' and 'last'
name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"
(parsed <- regexpr(name.rex, notables, perl = TRUE))
[1] 3 2 attr(,"match.length") [1] 12 16 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE attr(,"capture.start") first last [1,] 3 7 [2,] 2 10 attr(,"capture.length") first last [1,] 3 8 [2,] 7 8 attr(,"capture.names") [1] "first" "last"
gregexpr(name.rex, notables, perl = TRUE)[[2]]
[1] 2 attr(,"match.length") [1] 16 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE attr(,"capture.start") first last [1,] 2 10 attr(,"capture.length") first last [1,] 7 8 attr(,"capture.names") [1] "first" "last"
# parse.one: extract the captured substrings for each element of `res`.
#   res    - character vector that was searched
#   result - the corresponding regexpr(..., perl = TRUE) value, carrying the
#            "capture.start", "capture.length" and "capture.names" attributes
# Returns a character matrix with one row per element of `res`, with
# columns named after the capture groups.
parse.one <- function(res, result) {
  m <- do.call(rbind, lapply(seq_along(res), function(i) {
    # No match for this element: contribute "" for its row.
    if (result[i] == -1) return("")
    st <- attr(result, "capture.start")[i, ]
    substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1)
  }))
  colnames(m) <- attr(result, "capture.names")
  m
}
parse.one(notables, parsed)
first last [1,] "Ben" "Franklin" [2,] "Millard" "Fillmore"
## Decompose a URL into its components.
## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
x <- "http://stat.umn.edu:80/xyz"
# regexec() reports the position of the full match followed by the
# position of each parenthesized sub-expression.
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
m
[[1]] [1] 1 1 1 8 20 21 23 attr(,"match.length") [1] 26 7 4 12 3 2 4 attr(,"index.type") [1] "chars" attr(,"useBytes") [1] TRUE
regmatches(x, m)
[[1]] [1] "http://stat.umn.edu:80/xyz" "http://" [3] "http" "stat.umn.edu" [5] ":80" "80" [7] "/xyz"
## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
## is the path.  We can use this to make a function for extracting the
## parts of a URL:

# URL_parts: split each URL in `x` into protocol, host, port and path.
#   x - character vector of URLs
# Returns a character matrix with one row per element of `x` and the
# columns "protocol", "host", "port" and "path".
URL_parts <- function(x) {
  m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
  # Keep only the sub-matches for protocol (3), host (4), port (6), path (7).
  parts <- do.call(rbind,
                   lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L)))
  colnames(parts) <- c("protocol", "host", "port", "path")
  parts
}
URL_parts(x)
protocol host port path [1,] "http" "stat.umn.edu" "80" "/xyz"
## gregexec() may match multiple times within a single string.
pattern <- "([[:alpha:]]+)([[:digit:]]+)"
s <- "Test: A1 BC23 DEF456"
# One column per match; row 1 is the full match, the following rows are
# the parenthesized sub-expressions.
m <- gregexec(pattern, s)
m
[[1]] [,1] [,2] [,3] [1,] 7 10 15 [2,] 7 10 15 [3,] 8 12 18 attr(,"match.length") [,1] [,2] [,3] [1,] 2 4 6 [2,] 1 2 3 [3,] 1 2 3 attr(,"useBytes") [1] TRUE attr(,"index.type") [1] "chars"
regmatches(s, m)
[[1]] [,1] [,2] [,3] [1,] "A1" "BC23" "DEF456" [2,] "A" "BC" "DEF" [3,] "1" "23" "456"
## Before gregexec() was implemented, one could emulate it by running
## regexec() on the regmatches obtained via gregexpr().  E.g.:
lapply(regmatches(s, gregexpr(pattern, s)),
       function(e) regmatches(e, regexec(pattern, e)))
[[1]] [[1]][[1]] [1] "A1" "A" "1" [[1]][[2]] [1] "BC23" "BC" "23" [[1]][[3]] [1] "DEF456" "DEF" "456"