A string is created by entering text between two single or double quotation marks.

Say, A string of similar strings

R

Basic

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Printing: print() shows the quotes unless quote = FALSE; cat() writes the
# raw text and needs an explicit newline.
print("hello world")
print("hello world", quote = FALSE)
cat("hello world", "\n")

# c() coerces everything to character here, so NA becomes NA_character_
# and the number 23333 becomes the string "23333".
y <- c("what are you doing", NA, "I am Using R!", 23333)
nchar(y)
# 18 NA 13 5
length(y)
# 4
rev(y)
# "23333" "I am Using R!" NA "what are you doing"

# append() inserts the new values after the given position.
y <- append(y, "Lorem Ipsum", after = 2)
# "what are you doing" NA "Lorem Ipsum" "I am Using R!" "23333"
# Base R has no prepend(); inserting before position 2 is the same as
# appending after position 1.
y <- append(y, "Lorem Ipsum", after = 1)
# "what are you doing" "Lorem Ipsum" NA "Lorem Ipsum" "I am Using R!" "23333"

# Sorting: sort() returns the sorted values, order() the permutation that
# sorts them.
set.seed(0)
smpl <- sample(25) # same as sample(1:25, 25)
sort(smpl, decreasing = TRUE)
# 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
smpl[order(smpl, decreasing = TRUE)]
# 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1
sort(sample(LETTERS, 10, replace = TRUE), decreasing = TRUE)
# e.g. "Y" "X" "X" "X" "R" "Q" "O" "J" "G" "F"
# (the letters drawn depend on the R version's sampling algorithm)

Text-transform

You may know `text-transform` from CSS, with its three values uppercase, capitalize and lowercase; in R, case conversion works quite differently.

1
2
3
4
5
6
7
8
9
10
11
# A DNA sequence written with a mix of upper- and lower-case bases.
dna <- "ATTGCtacgGACGTTttAActga"

# tolower()/toupper() and casefold() are interchangeable for case conversion.
tolower(dna)
casefold(dna, upper = FALSE)
# "attgctacggacgttttaactga"
toupper(dna)
casefold(dna, upper = TRUE)
# "ATTGCTACGGACGTTTTAACTGA"

# chartr() replaces each character of the first string with the character at
# the same position in the second one, so both cases of a base map to the
# upper-case complementary base.
chartr("TtAaCcGg", "AATTGGCC", dna) # Complement DNA
# "TAACGATGCCTGCAAAATTGACT"

# Reverse complement: split into single characters, reverse, glue back together.
paste(
  rev(strsplit(chartr("TtAaCcGg", "AATTGGCC", dna), "")[[1]]),
  collapse = ""
) # Reverse Complement DNA
# "TCAGTTAAAACGTCCGTAGCAAT"
1
2
3
4
5
# Encryption and decryption (ROT13)
# The ranges in both specifications are expanded, so A-M maps onto N-Z and
# N-Z onto A-M (likewise for lower case): every letter moves 13 places.
chartr("A-Ma-mN-Zn-z","N-Zn-zA-Ma-m","The Quick Red Fox Jumps Over The Lazy Brown Dog")
# "Gur Dhvpx Erq Sbk Whzcf Bire Gur Ynml Oebja Qbt"
# Swapping the two specifications reverses the mapping and recovers the text.
chartr("N-Zn-zA-Ma-m","A-Ma-mN-Zn-z","Gur Dhvpx Erq Sbk Whzcf Bire Gur Ynml Oebja Qbt")
# "The Quick Red Fox Jumps Over The Lazy Brown Dog"

Paste

Concatenate vectors after converting to character.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# paste() joins its arguments with `sep` (a single space by default).
paste("Today is", weekdays(Sys.time()))
# e.g. "Today is 星期四" ("Thursday"); the weekday name follows the session locale
# With a single vector and no `collapse`, each element is just converted to character.
paste(1:10) #== as.character(1:10)
# "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
# `collapse` glues the elements of the result into one string.
paste(letters, collapse = "-")
# "a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z"
paste(letters, collapse = "")
# "abcdefghijklmnopqrstuvwxyz"
# Elements are pasted pairwise first and then collapsed with "~"; the outer
# parentheses print the value that was just assigned.
(nth <- paste0(1:12, c("st", "nd", "rd", rep("th", 9)),collapse = "~"))
# "1st~2nd~3rd~4th~5th~6th~7th~8th~9th~10th~11th~12th"
x<-1:26
y<-c(LETTERS)
paste0(x, y, collapse = ':')
# paste0(...) is shorthand for paste(..., sep = "")
# "1A:2B:3C:4D:5E:...26Z"
# `sep` goes between the paired elements, `collapse` between the pairs.
paste(x, y, sep='-', collapse = ':')
# "1-A:2-B:3-C:4-D:5-E...26-Z"
# Nested calls: the inner paste() builds the right-hand side of a formula-like string.
paste("y", paste(letters[1:5], collapse=" + "), sep=" ~ ")
# "y ~ a + b + c + d + e"

Slicing

1
2
3
4
5
6
# substr() extracts the characters from position 2 to position 4.
substr(paste(LETTERS,collapse = ""), 2, 4) 
# "BCD"
# substring() is vectorised over start and stop: the shorter stop vector 1:5
# is recycled, so the last five requests stop before they start and give "".
substring(paste(LETTERS,collapse = ""), 1:10, 1:5)
# "A" "B" "C" "D" "E" "" "" "" "" ""
# Paired start/stop sequences cut the alphabet into consecutive two-letter chunks.
substring(paste0(letters,collapse = ""),seq(1,25,2),seq(2,26,2))
# "ab" "cd" "ef" "gh" "ij" "kl" "mn" "op" "qr" "st" "uv" "wx" "yz"

**Tips:** Unlike Python, R does not index into the characters of a string with `[`. A string is a character vector of length one, so “abc”[1] returns “abc” itself, and “abc”[1:2] returns “abc” NA because the second element does not exist. Use substr() or substring() to extract characters.

1
2
3
4
5
6
7
8
9
# Positions are counted in characters, so the Japanese and Chinese strings
# are sliced the same way as the ASCII ones.
moon <- c("Moon goddess Cynthia", "Lunasky", "Luagrape", "ムーン", "月亮,着凉了黑夜")
# Same start and stop for every string.
substr(moon, 2, 5)
# "oon " "unas" "uagr" "ーン" "亮,着凉"
# The stop vector 3:5 is recycled across the five strings (3, 4, 5, 3, 4).
substring(moon, 2, 3:5)
# "oo" "una" "uagr" "ーン" "亮,着"
# One start/stop pair per string.
substring(moon,c(6,1,4,1,7), c(12,4,8,3,8))
# "goddess" "Luna" "grape" "ムーン" "黑夜"
# Replacement form: at most nchar("???") characters are overwritten inside each
# range, and the length of each string stays the same.
substring(moon,c(6,1,4,1,7), c(12,4,8,3,8)) <- "???"
# moon is now:
# "Moon ???dess Cynthia" "???asky" "Lua???pe" "???" "月亮,着凉了??"

Split

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# A six-line poem assembled from two literals; "\n" separates the lines.
text1 <- "One word is too often profaned\nFor me to profane it\nOne feeling too falsely distain'd\n"
text2 <- "For thee to distain it\nOne hope is too like despair\nFor prudence to smother"
text <- paste0(text1, text2)

# strsplit() returns a list with one element per input string.
# Splitting on "" yields the individual characters.
strsplit(text, "")
# [[1]]
# [1] "O" "n" "e" " " "w" "o" "r" "d" " " "i" "s" " " "t" "o" "o" " " "o" "f"
# ...
# [163] "e" "r"

# Splitting on the newline yields the lines.
strsplit(text, "\n")
# [[1]]
# [1] "One word is too often profaned"
# [2] "For me to profane it"
# [3] "One feeling too falsely distain'd"
# [4] "For thee to distain it"
# [5] "One hope is too like despair"
# [6] "For prudence to smother"

# split() groups a vector by a factor; the grouping vector 1:5 is recycled
# over the ten values.
split(1:10, 1:5)
# $`1`
# [1] 1 6
#
# $`2`
# [1] 2 7
#
# $`3`
# [1] 3 8
#
# $`4`
# [1] 4 9
#
# $`5`
# [1] 5 10

# Randomly distribute the words over groups. The number of groups is rounded
# up to a whole number: passing the fractional length(s) / 5 to sample() is
# handled differently across R versions.
set.seed(1995)
s <- strsplit(text, "\\s")[[1]]
split(s, sample(ceiling(length(s) / 5), length(s), replace = TRUE))
# Output recorded from the original run; the exact grouping depends on the
# R version's sampling algorithm.
# $`1`
# [1] "too" "thee" "prudence"
#
# $`2`
# [1] "too" "like" "For"
#
# $`3`
# [1] "is" "it" "For" "hope" "smother"
#
# $`4`
# [1] "often" "me" "profane" "One" "feeling" "falsely" "it"
# [8] "is" "too"
#
# $`5`
# [1] "profaned" "For" "distain"
#
# $`6`
# [1] "One" "word" "to" "distain'd" "to"
# [6] "One" "despair"
#
# $`7`
# [1] "to"

Regular Expression

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# ?regex
# sub() replaces only the first match in each string. The "g" in gsub() stands
# for global replacement (every match is substituted); the "l" in grepl()
# stands for logical (boolean) output.
text <- c("Hello, Hello, Anyone Here! Hi, Annabelle!", "How are you, Annabelle, Annabelle?","Hello, Annabelle is here.")
sub(pattern = 'Hello',replacement = 'Hi',x = text)
# [1] "Hi, Hello, Anyone Here! Hi, Annabelle!"
# [2] "How are you, Annabelle, Annabelle?"
# [3] "Hi, Annabelle is here."
gsub('Hello','Hi',text)
# [1] "Hi, Hi, Anyone Here! Hi, Annabelle!"
# [2] "How are you, Annabelle, Annabelle?"
# [3] "Hi, Annabelle is here."
# \\1 refers to the first capture group. This pattern has no parentheses, so
# the backreference is empty and each match is replaced by "__".
gsub('Annabelle','\\1_\\1_',text)
# [1] "Hello, Hello, Anyone Here! Hi, __!"
# [2] "How are you, __, __?"
# [3] "Hello, __ is here."
# With the parentheses in place, \\1 holds the matched name.
gsub('(Annabelle)','\\1_\\1_',text)
# [1] "Hello, Hello, Anyone Here! Hi, Annabelle_Annabelle_!"
# [2] "How are you, Annabelle_Annabelle_, Annabelle_Annabelle_?"
# [3] "Hello, Annabelle_Annabelle_ is here."
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# grep() returns the indices of the elements that match
# (integer(0) when nothing matches).
grep(pattern = "Hello", text)
# 1 3
# value = TRUE returns the matching elements themselves.
grep("Hello", text, value = TRUE)
# [1] "Hello, Hello, Anyone Here! Hi, Annabelle!"
# [2] "Hello, Annabelle is here."
# grepl() returns one logical per element.
grepl(pattern = "Hello", text)
# TRUE FALSE TRUE
# regexpr() gives the position and length of the first match in each element
# (-1 when there is no match).
regexpr(pattern = "Anna", text)
# [1] 32 14 8
# attr(,"match.length")
# [1] 4 4 4
# gregexpr() returns a list with the positions of every match.
gregexpr(pattern = "Anna", text)
# [[1]]
# [1] 32
# [[2]]
# [1] 14 25
# [[3]]
# [1] 8
# regexec() also returns a list but, like regexpr(), only reports the first match.
regexec(pattern = "Anna", text)
# [[1]]
# [1] 32
# [[2]]
# [1] 14
# [[3]]
# [1] 8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Report which elements of `txt` match `pat`.
#
# pat          regular expression handed to grep()
# txt          character vector to search
# ignore.case  forwarded to grep()
#
# Prints a short summary and returns the indices of the matching elements,
# or 0 (after printing "Not Found.") when nothing matches. The count that is
# printed is the number of matching elements, not the number of occurrences.
ggrep <- function(pat, txt, ignore.case = FALSE) {
  hits <- grep(pat, txt, ignore.case = ignore.case)
  n_hits <- length(hits)

  # Guard clause: nothing matched.
  if (n_hits == 0) {
    cat("Not Found.", "\n")
    return(0)
  }

  cat(pat, "appears", n_hits, "times", "\n")
  cat("Found in:", "\n", paste(txt[hits], collapse = "\n"), "\n")
  hits
}
# Hello appears 2 times
# Found in:
# Hello, Hello, Anyone Here! Hi, Annabelle!
# Hello, Annabelle is here.
# [1] 1 3

Advanced RE

There are a number of pre-built classes that you can use inside []:

  • [:alnum:]
    Alphanumeric characters: [:alpha:]and [:digit:].
  • [:alpha:]
    Alphabetic characters: [:lower:]and [:upper:].
  • [:blank:]
    Blank characters: space and tab.
  • [:cntrl:]
    Control characters. In ASCII, these characters have octal codes
    000 through 037, and 177 (`DEL’). In other character sets, these
    are the equivalent characters, if any.
  • [:digit:]
    Digits: `0 1 2 3 4 5 6 7 8 9’.
  • [:graph:]
    Graphical characters: [:alnum:] and [:punct:].
  • [:lower:]
    Lower-case letters: `a b c d e f g h i j k l m n o p q r s t u v w
    x y z’.
  • [:print:]
    Printable characters: [:alnum:], [:punct:], and whitespace.
  • [:punct:]
    Punctuation characters: ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
  • [:space:]
    Space characters: tab, newline, vertical tab, form feed, carriage
    return, and space.(basically equivalent to \s)
  • [:upper:]
    Upper-case letters: A B C D E F G H I J K L M N O P Q R S T U V W
    X Y Z.
  • [:xdigit:]
    Hexadecimal digits: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f

These all go inside the [] for character classes, i.e. [[:digit:]AX] matches all digits, A, and X.

You can also use Unicode properties, like [\p{Letter}], and various set operations, like [\p{Letter}--\p{script=latin}]. See ?"stringi-search-charclass" for details.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
## trim leading and trailing white space
## (three equivalent patterns: literal spaces, the POSIX [[:space:]] class, and \s)
gsub("^ +| +$", "", " str ")
gsub("^[[:space:]]+|[[:space:]]+$", "", " str ")
## PCRE - Perl Compatible Regular Expressions
gsub("^\\s+|\\s+$", "", " str ", perl = TRUE)
# str

## capitalizing
## \\U and \\L in the replacement switch the following text to upper or lower
## case; they are only available with perl = TRUE.
## Note: the first call also lower-cases the rest of each word, while the
## second only upper-cases the first letter.
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", text, perl=TRUE)
gsub("\\b(\\w)","\\U\\1", text, perl=TRUE)
# [1] "Hello, Hello, Anyone Here! Hi, Annabelle!"
# [2] "How Are You, Annabelle, Annabelle?"
# [3] "Hello, Annabelle Is Here."

## Upper-case the first and the last letter of every word; \\E ends the case
## conversion so the middle of the word is left untouched.
gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", text, perl=TRUE)
# [1] "HellO, HellO, AnyonE HerE! HI, AnnabellE!"
# [2] "HoW ArE YoU, AnnabellE, AnnabellE?"
# [3] "HellO, AnnabellE IS HerE."
## sub() does the same for the first word of each string only.
sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", text, perl=TRUE)
# [1] "HellO, Hello, Anyone Here! Hi, Annabelle!"
# [2] "HoW are you, Annabelle, Annabelle?"
# [3] "HellO, Annabelle is here."

## named capture
pr <- c("Sigur Rós Ólafur Arnalds Olafur Arnalds","As the Stars Fall Mono","Mogwai God Is An Astronaut")
# Two capitalised words separated by a space; (?<name>...) names each group.
name.rex <- "(?<first>[[:upper:]][[:lower:]]+) (?<last>[[:upper:]][[:lower:]]+)"
# Named groups need perl = TRUE. The result then carries capture.start,
# capture.length and capture.names attributes next to the usual match.length.
regexpr(name.rex, pr, perl = TRUE)
# [1] 18 8 1
# attr(,"match.length")
# [1] 14 10 10
# attr(,"capture.start")
# first last
# [1,] 18 26
# [2,] 8 14
# [3,] 1 8
# attr(,"capture.length")
# first last
# [1,] 7 6
# [2,] 5 4
# [3,] 6 3
# attr(,"capture.names")
# [1] "first" "last"
# Note: in the first string the match starts at "Arnalds Olafur" (position 18);
# per the recorded output the accented words before it were not matched by
# these classes.
# Extract the capture groups of a regexpr(..., perl = TRUE) result as a matrix.
#
# res     character vector that was searched
# result  value returned by regexpr(pattern, res, perl = TRUE) for a pattern
#         that contains capture groups
#
# Returns a character matrix with one row per element of `res` and one column
# per capture group (named after the groups). Rows for elements without a
# match, or with a missing match position, are filled with "".
parse.one <- function(res, result) {
  starts <- attr(result, "capture.start")
  lens <- attr(result, "capture.length")
  if (is.null(starts) || is.null(lens)) {
    stop(
      "`result` carries no capture information; use regexpr(..., perl = TRUE) ",
      "with a pattern that contains capture groups",
      call. = FALSE
    )
  }

  # Start from an all-"" matrix of the final shape, so that the column count
  # never depends on whether any element matched.
  m <- matrix(
    "",
    nrow = length(res),
    ncol = ncol(starts),
    dimnames = list(NULL, attr(result, "capture.names"))
  )

  matched <- which(!is.na(result) & result != -1)
  for (i in matched) {
    m[i, ] <- substring(res[i], starts[i, ], starts[i, ] + lens[i, ] - 1)
  }
  m
}
parse.one(pr, regexpr(name.rex, pr, perl = TRUE))
#      first     last
# [1,] "Arnalds" "Olafur"
# [2,] "Stars"   "Fall"
# [3,] "Mogwai"  "God"
## Using the built-in regmatches(): it returns each whole match rather than
## the individual capture groups
regmatches(pr, regexpr(name.rex, pr, perl = TRUE))
# [1] "Arnalds Olafur" "Stars Fall" "Mogwai God"

## parsing URL
# Split URLs into scheme, host, port, path, query and fragment.
#
# x  character vector of URLs
#
# Returns a character matrix with one row per URL. Components that are absent
# are returned as ""; a string that does not match at all yields a row of NA.
# The column name "protocal" (sic) is kept as is so that existing code
# indexing by that name keeps working.
URL_parts <- function(x) {
  # Every component stops at the delimiter that starts the next one, so a
  # path without a query is no longer reported as the query.
  url_rex <- paste0(
    "^(([^:/?#]+)://)?", # groups 1-2: optional scheme
    "([^:/?#]+)",        # group 3:    host
    "(:([0-9]+))?",      # groups 4-5: optional port
    "(/[^?#]*)?",        # group 6:    optional path
    "(\\?([^#]*))?",     # groups 7-8: optional query
    "(#(.*))?"           # groups 9-10: optional fragment
  )
  m <- regexec(url_rex, x)
  # regmatches() puts the whole match first, so group k sits at index k + 1.
  parts <- do.call(
    rbind,
    lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L, 9L, 11L))
  )
  colnames(parts) <- c("protocal", "host", "port", "path", "query", "fragment")
  parts
}
x <- "http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loaded"
URL_parts(x)
#      protocal host            port path    query                            fragment
# [1,] "http"   "www.xiami.com" ""   "/play" "ids=/song/playlist/id/1/type/9" "loaded"
y <- "https://github.com:8080/username?tab=repositories"
URL_parts(y)
#      protocal host         port   path        query              fragment
# [1,] "https"  "github.com" "8080" "/username" "tab=repositories" ""

Match

1
2
3
4
5
6
7
8
9
10
11
# Lookup table: 26 upper-case letters, 26 lower-case letters, the 12 month
# abbreviations and the 12 full month names.
misc <- c(LETTERS, letters, month.abb, month.name)

# match() looks for exact matches and returns the first position.
match(x = "May", misc)
# 57
# Why 57? The 52 letters come first and "May" is the 5th abbreviation: 26 * 2 + 5

# pmatch() does partial matching: exact matches win, a unique partial match is
# accepted, and no match or an ambiguous match gives NA.
pmatch(c("M", "J", "MJ", "May", "Ma"), misc)
# 13 10 NA 57 NA

# charmatch() is similar, but returns 0 for an ambiguous match (several exact
# or several partial matches) and NA only when nothing matches.
charmatch(c("M", "J", "MJ", "May", "Ma", "Jul"), misc)
# 13 10 NA 0 0 59

Misc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# strtrim() cuts each string to the given width (one width per string here).
strtrim(c("ggplot2", "tidyr", "dplyr"), c(1,5,2)) 
#== substring(c("ggplot2", "tidyr", "dplyr"),1,c(1,5,2))
# "g" "tidyr" "dp"
# strwrap() collapses the tabs, newlines and repeated spaces, then puts the
# prefix in front of every output line.
strwrap("\t\n ggplot2 tidyr \n\t dplyr\t\n", prefix = "R packages:")
# "R packages:ggplot2 tidyr dplyr"
# With width = 1 every word ends up on its own line.
strwrap("\t\n ggplot2 tidyr \n\t dplyr\t\n", width = 1, prefix = "R package:")
# "R package:ggplot2" "R package:tidyr" "R package:dplyr"
## distances
# Edit distance that ignores the order of two comma-separated parts.
#
# word1, word2  strings to compare
#
# Returns the smaller of the edit distance between `word1` and `word2` and the
# distance between `word1` and `word2` with the text before and after its last
# comma swapped ("a,b" -> "b,a").
adist_special <- function(word1, word2) {
  # The greedy first (.*) consumes everything up to the last comma.
  swapped <- gsub(pattern = "(.*),(.*)", replacement = "\\2,\\1", x = word2)
  min(adist(word1, word2), adist(word1, swapped))
}
adist("hello,world", "world,hello")
# 8
adist_special("hello,world", "world,hello")
# 0

Stringr

Introduction

The examples in this part assume that the package has been attached with library(stringr).

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# The str_* functions come from stringr; no library() call is visible in this
# section, so it is assumed to be attached already. Each stringr call is shown
# next to its base R counterpart.
# Number of characters in a string.
str_length("abc")
str_count("abc")
nchar("abc")
# 3
x <- c("abcdef", "ghifjk")
# The 3rd letter
str_sub(x, 3, 3)
substr(x,3,3)
# "c" "i"
## The 2nd to 2nd-to-last character
## (str_sub() accepts negative positions, counted from the end of the string;
## with substr() the end position has to be computed from nchar())
str_sub(x, 2, -2)
substr(x,2,nchar(x)-1)
# "bcde" "hifj"
# Both functions have a replacement form that modifies x in place.
substr(x,3,3) <- "X"
str_sub(x, 3, 3) <- "X"
x
# "abXdef" "ghXfjk"

Whitespaces

1
2
3
4
5
6
7
8
9
10
11
12
13
x <- c("abc", "defghi")
str_pad(x, 10) # default pads on left
unname(sapply(x, function(.){paste0(paste(rep(" ",10-nchar(.)),collapse=""),.)}))
str_pad(x, 10, "both")
# [1] " abc" " defghi"
# [1] " abc" " defghi"
# [1] " abc " " defghi "
str_pad(x, 4)
# " abc" "defghi"
x <- c(" Short", "This is a long string")
x %>% str_trunc(25) %>% str_pad(25, "right")
# " Short "
# "This is a long string "
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
## removes leading and trailing whitespace
x <- c(" a ", "b ", " c")
str_trim(x)
str_trim(x, "left")
# [1] "a" "b" "c"
# [1] "a " "b " "c"
##
jabberwocky <- str_c(
"Twas brillig, and the slithy toves ",
"did gyre and gimble in the wabe: ",
"All mimsy were the borogoves, ",
"and the mome raths outgrabe. "
)
## modify existing whitespace in order to wrap a paragraph of text
cat(str_wrap(jabberwocky, width = 40),"\n")
cat(paste(strwrap(jabberwocky, width = 40),collapse = "\n"))
# Twas brillig, and the slithy toves did
# gyre and gimble in the wabe: All mimsy
# were the borogoves, and the mome raths
# outgrabe.
# Twas brillig, and the slithy toves did
# gyre and gimble in the wabe: All mimsy
# were the borogoves, and the mome raths
# outgrabe.

Locale sensitive

The locale always defaults to English to ensure that the default behaviour is identical across systems. Locales always include a two-letter ISO 639-1 language code (like “en” for English or “zh” for Chinese), and optionally an ISO 3166 country code (like “en_GB” vs “en_US”). You can see a complete list of available locales by running stringi::stri_locale_list().

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
x <- "I like horses."
str_to_upper(x)
str_to_title(x)
str_to_lower(x)
## Turkish has two sorts of i: with and without the dot
str_to_lower(x, "tr")
# [1] "I LIKE HORSES."
# [1] "I Like Horses."
# [1] "i like horses."
# [1] "ı like horses."

## String ordering and sorting
x <- c("y", "i", "k")
str_order(x)
str_sort(x)
# In Lithuanian, y comes between i and k
str_sort(x, locale = "lt")
# [1] 2 3 1
# [1] "i" "k" "y"
# [1] "i" "y" "k"

Engines

There are four main engines that stringr can use to describe patterns:

  • Regular expressions, the default, as shown above, and described in
    vignette("regular-expressions").
  • Fixed bytewise matching, with fixed().
  • Locale-sensitive character matching, with coll()
  • Text boundary analysis with boundary().
1
2
3
4
5
6
7
8
9
10
11
## fixed(x) only matches the exact sequence of bytes specified by `x`.
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
a1 == a2
# [1] "á" "á"
# [1] FALSE
str_detect(a1, fixed(a2))
str_detect(a1, coll(a2))
# [1] FALSE
# [1] TRUE
1
2
3
4
5
6
i <- c("I", "İ", "i", "ı")
i
## The downside of `coll()` is slow speed
##
str_subset(i, coll("i", ignore_case = TRUE))
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
## `boundary()` matches boundaries between characters, lines, sentences or words
## "" is treated as `boundary("character")`
x <- '2B or not 2B, that is a ? "Your future depends on your dreams." So go to sleep.'
str_split(x, boundary("word"))
str_count(x, boundary("word"))
str_extract_all(x, boundary("word"))
## Recorded output of the three calls above, in order:
# [[1]]
# [1] "2B" "or" "not" "2B" "that" "is" "a" "Your" "future" "depends" "on"
# [12] "your" "dreams" "So" "go" "to" "sleep"
#
# [1] 17
# [[1]]
# [1] "2B" "or" "not" "2B" "that" "is" "a" "Your" "future" "depends" "on"
# [12] "your" "dreams" "So" "go" "to" "sleep"

Pattern matching

Each pattern matching function has the same first two arguments, a character vector of strings to process and a single pattern to match. stringr provides pattern matching functions to detect, locate, extract, match, replace, and split strings.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
## A plain pattern string is equivalent to wrapping it in a call to `regex()`
## `regex()` lets you match case-insensitively with `ignore_case = TRUE`
## and lets `.` match every character, including `\n`, with `dotall = TRUE`
str_detect("\nX\n", ".X.")
str_detect("\nx\n", regex(".X.", ignore_case = TRUE, dotall = TRUE))
# [1] FALSE
# [1] TRUE
## `\` is used as an escape character in regular expressions
## Inside an R string literal a backslash must itself be written as `\\`, so
## the regex that matches one literal backslash is spelled "\\\\"
x <- "a\\b"
writeLines(x)
str_extract(x, "\\\\")
# a\b
# [1] "\\"
## An alternative quoting mechanism is `\Q...\E`: all the characters in `...` are treated as exact matches.
x <- c("a.b.c.d", "aeb")
starts_with <- "a.b"

## Unquoted, the `.` in "a.b" matches any character, so "aeb" is detected too.
str_detect(x, paste0("^", starts_with))
str_detect(x, paste0("^\\Q", starts_with, "\\E"))
# [1] TRUE TRUE
# [1] TRUE FALSE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
strings <- c(
"apple_juice@orange.com",
"cinderrella~@fairy.tale.com",
"fantasy.design@literature.com",
"First: rhythm.rhyme@rime.com.cn; Second: monsoon@season.sea.sky.com"
)
# The pattern ends with `$`, so an address only matches when it sits at the
# very end of the string.
email <- "\\w+([-_\\.]\\w+)*@\\w+([-\\.]\\w+)*\\.\\w+([-\\.]\\w+)*$"
str_detect(strings, email) # like grepl()
str_subset(strings, email) # like grep(value = TRUE)
# [1] TRUE FALSE TRUE TRUE
# [1] "apple_juice@orange.com"
# [2] "fantasy.design@literature.com"
# [3] "First: rhythm.rhyme@rime.com.cn; Second: monsoon@season.sea.sky.com"
# Number of matches per string.
str_count(strings, email)
# 1 0 1 1
loc <- str_locate(strings, email)
str_locate_all(strings, email)
# [[1]]
# start end
# [1,] 1 22
#
# [[2]]
# start end
#
# [[3]]
# start end
# [1,] 1 29
#
# [[4]]
# start end
# [1,] 42 67
str_extract(strings, email)
# [1] "apple_juice@orange.com" NA "fantasy.design@literature.com"
# [4] "monsoon@season.sea.sky.com"
str_extract_all(strings, email)
# [[1]]
# [1] "apple_juice@orange.com"
#
# [[2]]
# character(0)
#
# [[3]]
# [1] "fantasy.design@literature.com"
#
# [[4]]
# [1] "monsoon@season.sea.sky.com"
str_extract_all(strings, email, simplify = TRUE)
# [,1]
# [1,] "apple_juice@orange.com"
# [2,] ""
# [3,] "fantasy.design@literature.com"
# [4,] "monsoon@season.sea.sky.com"
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
str_match(strings, email) # regmatches()
str_match_all(strings, email)
# [,1] [,2] [,3] [,4]
# [1,] "apple_juice@orange.com" NA NA NA
# [2,] NA NA NA NA
# [3,] "fantasy.design@literature.com" ".design" NA NA
# [4,] "monsoon@season.sea.sky.com" NA ".sky" NA
# [[1]]
# [,1] [,2] [,3] [,4]
# [1,] "apple_juice@orange.com" NA NA NA
#
# [[2]]
# [,1] [,2] [,3] [,4]
#
# [[3]]
# [,1] [,2] [,3] [,4]
# [1,] "fantasy.design@literature.com" ".design" NA NA
#
# [[4]]
# [,1] [,2] [,3] [,4]
# [1,] "monsoon@season.sea.sky.com" NA ".sky" NA

str_replace(strings, email, "moon")
str_replace_all(strings, email, "sun")
# [1] "moon" "cinderrella~@fairy.tale.com"
# [3] "moon" "First: rhythm.rhyme@rime.com.cn; Second: moon"
# [1] "sun" "cinderrella~@fairy.tale.com"
# [3] "sun" "First: rhythm.rhyme@rime.com.cn; Second: sun"

str_split("a-b-c-d-e", "-")
str_split_fixed("a-b-c-d-e", "-", n = 3)
# [[1]]
# [1] "a" "b" "c" "d" "e"
#
# [,1] [,2] [,3]
# [1,] "a" "b" "c-d-e"

Advanced Pattern matching

Individual Unicode characters can be specified in five ways:

  • \xhh: 2 hex digits.

  • \x{hhhh}: 1-6 hex digits.

  • \uhhhh: 4 hex digits.

  • \Uhhhhhhhh: 8 hex digits.

  • \N{name}, e.g. \N{grinning face} matches the basic smiling emoji.

Specify common control characters:

  • \a: bell.
  • \cX: match a control-X character.
  • \e: escape (\u001B).
  • \f: form feed (\u000C).
  • \n: line feed (\u000A).
  • \r: carriage return (\u000D).
  • \t: horizontal tabulation (\u0009).
  • \0ooo: matches an octal character. ‘ooo’ is from one to three octal digits,
    from 000 to 0377. The leading zero is required.
  • \X - which matches a grapheme cluster, a set of individual elements that form a single symbol. For example, one way of representing “á” is as the letter “a” plus an accent: . will match the component “a”, while \X will match the complete symbol
  • \d - matches any digit. The complement, \D, matches any character that is not a decimal digit.
  • \s - matches any whitespace. This includes tabs, newlines, form feeds, and any character in the Unicode Z Category (which includes a variety of space characters and other separators.). The complement, \S, matches any non-whitespace character.
  • \p{property name} - matches any character with specific unicode property, like \p{Uppercase} or \p{Diacritic}. The complement, \P{property name}, matches all characters without the property. A complete list of unicode properties can be found at http://www.unicode.org/reports/tr44/#Property_Index.
  • \w - matches any “word” character, which includes alphabetic characters, marks and decimal numbers. The complement, \W, matches any non-word character.
  • \b - matches word boundaries, the transition between word and non-word characters. \B matches the opposite: boundaries that have either both word or non-word characters on either side.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
x <- "a\u0301"
str_extract(x, ".")
# [1] "a"
str_extract(x, "\\X")
# [1] "á"
# [1] "a<U+0301>"

## Chinese characters (the range U+4E00 to U+9FA5)
str_extract("moon月sun", "[\u4e00-\u9fa5]")
# [1] "月"
## \d includes any character in the Unicode Category of Nd (“Number, Decimal Digit”)
## Some Khmer digits (1, 2 and 3)
str_detect("១២៣", "\\d")
# TRUE

(text <- "Some \t badly\n\t\tspaced \f text")
# [1] "Some \t badly\n\t\tspaced \f text"
str_replace_all(text, "\\s+", " ")
# [1] "Some badly spaced text"

(text <- c('"Double quotes"', "«Guillemet»", "“Fancy quotes”"))
str_replace_all(text, "\\p{quotation mark}", "'")
# "'Double quotes'" "'Guillemet'" "'Fancy quotes'"

str_replace_all("The quick\t\nbrown fox;jumps+over the lazy2dog", "\\b", "_")
# [1] "_The_ _quick_\t\n_brown_ _fox_;_jumps_+_over_ _the_ _lazy2dog_"
str_replace_all("The quick\t\nbrown fox;jumps+over the lazy2dog", "\\B", "_")
# [1] "T_h_e q_u_i_c_k\t_\nb_r_o_w_n _ f_o_x;j_u_m_p_s+o_v_e_r t_h_e l_a_z_y_2_d_o_g"

## Finding all fruits that have a repeated pair of letters
pattern <- "(..)\\1"
fruit %>%
str_subset(pattern)
# [1] "banana" "coconut" "cucumber" "jujube" "papaya"
# [6] "salal berry"

fruit %>%
str_subset(pattern) %>%
str_match(pattern)
# [,1] [,2]
# [1,] "anan" "an"
# [2,] "coco" "co"
# [3,] "cucu" "cu"
# [4,] "juju" "ju"
# [5,] "papa" "pa"
# [6,] "alal" "al"

## Using (?:...), the non-grouping parentheses, to control precedence but not capture the match in a group.
str_match(c("grey", "gray"), "gr(e|a)y")
#> [,1] [,2]
#> [1,] "grey" "e"
#> [2,] "gray" "a"
str_match(c("grey", "gray"), "gr(?:e|a)y")
# [,1]
# [1,] "grey"
# [2,] "gray"

To match a literal “$” or “^”, you need to escape them, \$, and \^.

For multiline strings, you can use regex(multiline = TRUE). This changes the behaviour of ^ and $, and introduces three new operators:

  • ^ now matches the start of each line.
  • $ now matches the end of each line.
  • \A matches the start of the input.
  • \z matches the end of the input.
  • \Z matches the end of the input, but before the final line terminator,
    if it exists.
1
2
3
4
5
6
7
x <- "Line 1\nLine 2\nLine 3\n"
str_extract_all(x, "^Line..")[[1]]
# [1] "Line 1"
str_extract_all(x, regex("^Line..", multiline = TRUE))[[1]]
# [1] "Line 1" "Line 2" "Line 3"
str_extract_all(x, regex("\\ALine..", multiline = TRUE))[[1]]
# [1] "Line 1"

Putting a ? after the quantifiers below makes them “lazy”; putting a + after them makes the match possessive:

  • ??: 0 or 1, prefer 0.

  • +?: 1 or more, match as few times as possible.

  • *?: 0 or more, match as few times as possible.

  • {n,}?: n or more, match as few times as possible.

  • {n,m}?: between n and m, match as few times as possible, but at least n.

  • ?+: 0 or 1, possessive.

  • ++: 1 or more, possessive.

  • *+: 0 or more, possessive.

  • {n}+: exactly n, possessive.

  • {n,}+: n or more, possessive.

  • {n,m}+: between n and m, possessive.

1
2
3
4
5
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_extract(x, c("C{2,3}", "C{2,3}?"))
# [1] "CCC" "CC"
str_extract(x, c("C[LX]+", "C[LX]+?"))
# [1] "CLXXX" "CL"

A related concept is the atomic-match parenthesis, (?>...). If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole.

1
2
3
4
5
6
## The atomic match fails because it matches A, and then the next character is a C so it fails.
str_detect("ABC", "(?>A|.B)C")
# [1] FALSE
## The regular match succeeds because it matches A, but then C doesn’t match, so it back-tracks and tries B instead
str_detect("ABC", "(?:A|.B)C")
# [1] TRUE

These assertions look ahead or behind the current match without “consuming” any characters (i.e. changing the input position).

  • (?=...): positive look-ahead assertion. Matches if ... matches at the
    current input.
  • (?!...): negative look-ahead assertion. Matches if ... does not
    match at the current input.
  • (?<=...): positive look-behind assertion. Matches if ... matches text
    preceding the current position, with the last character of the match
    being the character just before the current position. Length must be bounded
    (i.e. no * or +).
  • (?<!...): negative look-behind assertion. Matches if ... does not
    match text preceding the current position. Length must be bounded
    (i.e. no * or +).

These are useful when you want to check that a pattern exists, but you don’t want to include it in the result:

1
2
3
4
5
6
7
x <- c("1 piece", "2 pieces", "3")
str_extract(x, "\\d+(?= pieces?)")
# [1] "1" "2" NA

y <- c("100", "$400")
str_extract(y, "(?<=\\$)\\d+(?#this is a comment)")
# [1] NA "400"

Using regex(comments = TRUE). This form ignores spaces and newlines, and everything after #. To match a literal space, escape it with a backslash.

1
2
3
4
5
6
7
8
9
10
11
12
13
# Free-spacing pattern for a North American phone number. With
# comments = TRUE, unescaped white space and everything after `#` inside the
# pattern are ignored, so a literal space has to be escaped as `\\ `.
phone <- regex("
  # match American phone number
  \\(?      # optional opening parens
  (\\d{3})  # area code
  [)\\ -]?  # optional closing parens, space, or dash
  (\\d{3})  # another three numbers
  [\\ -]?   # optional space or dash
  (\\d{4})  # four more numbers
", comments = TRUE)

str_match("514-791-8141", phone)
#      [,1]           [,2]  [,3]  [,4]
# [1,] "514-791-8141" "514" "791" "8141"

REFERENCES