R: Examples for 'base::grep'

### ** Examples

grep("[a-z]", letters)

 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26

txt <- c("arm","foot","lefroo", "bafoobar")
if(length(i <- grep("foo", txt)))
   cat("'foo' appears at least once in\n\t", txt, "\n")

'foo' appears at least once in
	 arm foot lefroo bafoobar

i # 2 and 4

[1] 2 4

txt[i]

[1] "foot"     "bafoobar"

## Double all 'a' or 'b's;  "\" must be escaped, i.e., 'doubled'
gsub("([ab])", "\\1_\\1_", "abc and ABC")

[1] "a_a_b_b_c a_a_nd ABC"

txt <- c("The", "licenses", "for", "most", "software", "are",
  "designed", "to", "take", "away", "your", "freedom",
  "to", "share", "and", "change", "it.",
  "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
  "is", "intended", "to", "guarantee", "your", "freedom", "to",
  "share", "and", "change", "free", "software", "--",
  "to", "make", "sure", "the", "software", "is",
  "free", "for", "all", "its", "users")
( i <- grep("[gu]", txt) ) # indices

[1]  7 11 16 24 29 30 35 41 49

stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )

## Note that for some implementations character ranges are
## locale-dependent (but not currently).  Then [b-e] in locales such as
## en_US may include B as the collation order is aAbBcCdDe ...
(ot <- sub("[b-e]",".", txt))

 [1] "Th."       "li.enses"  "for"       "most"      "softwar."  "ar."      
 [7] ".esigned"  "to"        "tak."      "away"      "your"      "fr.edom"  
[13] "to"        "shar."     "an."       ".hange"    "it."       ""         
[19] "By"        ".ontrast," "th."       "GNU"       "G.neral"   "Pu.lic"   
[25] "Li.ense"   "is"        "int.nded"  "to"        "guarant.e" "your"     
[31] "fr.edom"   "to"        "shar."     "an."       ".hange"    "fr.e"     
[37] "softwar."  "--"        "to"        "mak."      "sur."      "th."      
[43] "softwar."  "is"        "fr.e"      "for"       "all"       "its"      
[49] "us.rs"

txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution

 [1] "licenses"  "designed"  "freedom"   "change"    "General"   "Public"   
 [7] "License"   "intended"  "guarantee" "freedom"   "change"    "free"     
[13] "free"

## In caseless matching, ranges include both cases:
a <- grep("[b-e]", txt, value = TRUE)
b <- grep("[b-e]", txt, ignore.case = TRUE, value = TRUE)
setdiff(b, a)

[1] "By"

txt[gsub("g","#", txt) !=
    gsub("g","#", txt, ignore.case = TRUE)] # the "G" words

[1] "GNU"     "General"

regexpr("en", txt)

 [1] -1  4 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  2 -1  4
[26] -1  4 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
attr(,"match.length")
 [1] -1  2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  2 -1  2
[26] -1  2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

gregexpr("e", txt)

[[1]]
[1] 3
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[2]]
[1] 4 7
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[3]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[4]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[5]]
[1] 8
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[6]]
[1] 3
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[7]]
[1] 2 7
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[8]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[9]]
[1] 4
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[10]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[11]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[12]]
[1] 3 4
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[13]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[14]]
[1] 5
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[15]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[16]]
[1] 6
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[17]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[18]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[19]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[20]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[21]]
[1] 3
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[22]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[23]]
[1] 2 4
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[24]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[25]]
[1] 4 7
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[26]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[27]]
[1] 4 7
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[28]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[29]]
[1] 8 9
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[30]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[31]]
[1] 3 4
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[32]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[33]]
[1] 5
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[34]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[35]]
[1] 6
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[36]]
[1] 3 4
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[37]]
[1] 8
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[38]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[39]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[40]]
[1] 4
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[41]]
[1] 4
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[42]]
[1] 3
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[43]]
[1] 8
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[44]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[45]]
[1] 3 4
attr(,"match.length")
[1] 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[46]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[47]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[48]]
[1] -1
attr(,"match.length")
[1] -1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

[[49]]
[1] 3
attr(,"match.length")
[1] 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

## Using grepl() for filtering
## Find functions with argument names matching "warn":
findArgs <- function(env, pattern) {
  nms <- ls(envir = as.environment(env))
  nms <- nms[is.na(match(nms, c("F","T")))] # <-- work around "checking hack"
  aa <- sapply(nms, function(.) { o <- get(.)
               if(is.function(o)) names(formals(o)) })
  iw <- sapply(aa, function(a) any(grepl(pattern, a, ignore.case=TRUE)))
  aa[iw]
}
findArgs("package:base", "warn")

$attach
[1] "what"           "pos"            "name"           "warn.conflicts"

$dir.create
[1] "path"         "showWarnings" "recursive"    "mode"        

$file.create
[1] "..."          "showWarnings"

$library
 [1] "package"         "help"            "pos"             "lib.loc"        
 [5] "character.only"  "logical.return"  "warn.conflicts"  "quietly"        
 [9] "verbose"         "mask.ok"         "exclude"         "include.only"   
[13] "attach.required"

$readLines
[1] "con"      "n"        "ok"       "warn"     "encoding" "skipNul" 

$require
[1] "package"         "lib.loc"         "quietly"         "warn.conflicts" 
[5] "character.only"  "mask.ok"         "exclude"         "include.only"   
[9] "attach.required"

## trim trailing white space
str <- "Now is the time      "
sub(" +$", "", str)  ## spaces only

[1] "Now is the time"

## what is considered 'white space' depends on the locale.
sub("[[:space:]]+$", "", str) ## white space, POSIX-style

[1] "Now is the time"

## what PCRE considered white space changed in version 8.34: see ?regex
sub("\\s+$", "", str, perl = TRUE) ## PCRE-style white space

[1] "Now is the time"

## capitalizing
txt <- "a test of capitalizing"
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)

[1] "A Test Of Capitalizing"

gsub("\\b(\\w)",    "\\U\\1",       txt, perl=TRUE)

[1] "A Test Of Capitalizing"

txt2 <- "useRs may fly into JFK or laGuardia"
gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)

[1] "UseRS MaY FlY IntO JFK OR LaGuardiA"

 sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl=TRUE)

[1] "UseRS may fly into JFK or laGuardia"

## named capture
notables <- c("  Ben Franklin and Jefferson Davis",
              "\tMillard Fillmore")
# name groups 'first' and 'last'
name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"
(parsed <- regexpr(name.rex, notables, perl = TRUE))

[1] 3 2
attr(,"match.length")
[1] 12 16
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
attr(,"capture.start")
     first last
[1,]     3    7
[2,]     2   10
attr(,"capture.length")
     first last
[1,]     3    8
[2,]     7    8
attr(,"capture.names")
[1] "first" "last"

gregexpr(name.rex, notables, perl = TRUE)[[2]]

[1] 2
attr(,"match.length")
[1] 16
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
attr(,"capture.start")
     first last
[1,]     2   10
attr(,"capture.length")
     first last
[1,]     7    8
attr(,"capture.names")
[1] "first" "last"

parse.one <- function(res, result) {
  m <- do.call(rbind, lapply(seq_along(res), function(i) {
    if(result[i] == -1) return("")
    st <- attr(result, "capture.start")[i, ]
    substring(res[i], st, st + attr(result, "capture.length")[i, ] - 1)
  }))
  colnames(m) <- attr(result, "capture.names")
  m
}
parse.one(notables, parsed)

     first     last      
[1,] "Ben"     "Franklin"
[2,] "Millard" "Fillmore"

## Decompose a URL into its components.
## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
x <- "http://stat.umn.edu:80/xyz"
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
m

[[1]]
[1]  1  1  1  8 20 21 23
attr(,"match.length")
[1] 26  7  4 12  3  2  4
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE

regmatches(x, m)

[[1]]
[1] "http://stat.umn.edu:80/xyz" "http://"                   
[3] "http"                       "stat.umn.edu"              
[5] ":80"                        "80"                        
[7] "/xyz"

## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
## is the path.  We can use this to make a function for extracting the
## parts of a URL:
URL_parts <- function(x) {
    m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
    parts <- do.call(rbind,
                     lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L)))
    colnames(parts) <- c("protocol","host","port","path")
    parts
}
URL_parts(x)

     protocol host           port path  
[1,] "http"   "stat.umn.edu" "80" "/xyz"

## gregexec() may match multiple times within a single string.
pattern <- "([[:alpha:]]+)([[:digit:]]+)"
s <- "Test: A1 BC23 DEF456"
m <- gregexec(pattern, s)
m

[[1]]
     [,1] [,2] [,3]
[1,]    7   10   15
[2,]    7   10   15
[3,]    8   12   18
attr(,"match.length")
     [,1] [,2] [,3]
[1,]    2    4    6
[2,]    1    2    3
[3,]    1    2    3
attr(,"useBytes")
[1] TRUE
attr(,"index.type")
[1] "chars"

regmatches(s, m)

[[1]]
     [,1] [,2]   [,3]    
[1,] "A1" "BC23" "DEF456"
[2,] "A"  "BC"   "DEF"   
[3,] "1"  "23"   "456"

## Before gregexec() was implemented, one could emulate it by running
## regexec() on the regmatches obtained via gregexpr().  E.g.:
lapply(regmatches(s, gregexpr(pattern, s)),
       function(e) regmatches(e, regexec(pattern, e)))

[[1]]
[[1]][[1]]
[1] "A1" "A"  "1" 

[[1]][[2]]
[1] "BC23" "BC"   "23"  

[[1]][[3]]
[1] "DEF456" "DEF"    "456"

Examples for 'base::grep'

Pattern Matching and Replacement