# Encryption and decryption (ROT13).
# chartr() accepts character ranges, so swapping the two halves of the
# alphabet (upper and lower case) is a single translation; applying the
# inverse specification restores the original text.
plain <- "The Quick Red Fox Jumps Over The Lazy Brown Dog"
cipher <- chartr("A-Ma-mN-Zn-z", "N-Zn-zA-Ma-m", plain)
cipher
# "Gur Dhvpx Erq Sbk Whzcf Bire Gur Ynml Oebja Qbt"
chartr("N-Zn-zA-Ma-m", "A-Ma-mN-Zn-z", cipher)
# "The Quick Red Fox Jumps Over The Lazy Brown Dog"
Paste
Concatenate vectors after converting to character.
# paste() converts its arguments to character and joins them element-wise
# with `sep`; `collapse` then joins the resulting vector into one string.
paste("Today is", weekdays(Sys.time()))
# e.g. "Today is Thursday" (the weekday name follows the session locale)
paste(1:10)  # same as as.character(1:10)
# "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
paste(letters, collapse = "-")
# "a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z"
paste(letters, collapse = "")
# "abcdefghijklmnopqrstuvwxyz"
(nth <- paste0(1:12, c("st", "nd", "rd", rep("th", 9)), collapse = "~"))
# "1st~2nd~3rd~4th~5th~6th~7th~8th~9th~10th~11th~12th"
x <- 1:26
y <- LETTERS
paste0(x, y, collapse = ":")  # paste0(...) is paste(..., sep = "")
# "1A:2B:3C:4D:5E:...26Z"
paste(x, y, sep = "-", collapse = ":")
# "1-A:2-B:3-C:4-D:5-E...26-Z"
paste("y", paste(letters[1:5], collapse = " + "), sep = " ~ ")
# "y ~ a + b + c + d + e"
**Tips:** Unlike Python, R does not index into a string with `[`. A string is a character vector of length one, so “abc”[1] returns “abc” itself, and “abc”[1:2] returns “abc” NA because the second element does not exist.
[:cntrl:]
Control characters. In ASCII, these characters have octal codes
000 through 037, and 177 (`DEL’). In other character sets, these
are the equivalent characters, if any.
[:digit:]
Digits: `0 1 2 3 4 5 6 7 8 9’.
[:graph:]
Graphical characters: [:alnum:] and [:punct:].
[:lower:]
Lower-case letters: `a b c d e f g h i j k l m n o p q r s t u v w
x y z’.
[:print:]
Printable characters: [:alnum:], [:punct:], and whitespace.
[:space:]
Space characters: tab, newline, vertical tab, form feed, carriage
return, and space.(basically equivalent to \s)
[:upper:]
Upper-case letters: A B C D E F G H I J K L M N O P Q R S T U V W
X Y Z.
[:xdigit:]
Hexadecimal digits: 0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f
These all go inside the [] for character classes, i.e. [[:digit:]AX] matches all digits, A, and X.
You can also use Unicode properties, like [\p{Letter}], and various set operations, like [\p{Letter}--\p{script=latin}]. See ?"stringi-search-charclass" for details.
# Length and substrings: each stringr call is followed by its base R
# equivalent.
str_length("abc")
str_count("abc")
nchar("abc")
# 3

x <- c("abcdef", "ghifjk")
# The 3rd letter
str_sub(x, 3, 3)
substr(x, 3, 3)
# "c" "i"

## The 2nd to 2nd-to-last character
## (str_sub() accepts negative positions counted from the end; substr()
## needs the end position computed from nchar())
str_sub(x, 2, -2)
substr(x, 2, nchar(x) - 1)
# "bcde" "hifj"

## Both functions have a replacement form that modifies `x` in place
substr(x, 3, 3) <- "X"
str_sub(x, 3, 3) <- "X"
x
# "abXdef" "ghXfjk"
Whitespaces
# Padding strings to a fixed width.
x <- c("abc", "defghi")
str_pad(x, 10)  # default pads on left
# [1] "       abc" "    defghi"
## Base R equivalent: formatC() right-justifies within `width` and, unlike
## a hand-rolled rep(" ", width - nchar(x)), does not fail when a string is
## already longer than `width`.
unname(formatC(x, width = 10))
# [1] "       abc" "    defghi"
str_pad(x, 10, "both")
# [1] "   abc    " "  defghi  "

## Strings already at least `width` long are returned unchanged
str_pad(x, 4)
# [1] " abc"   "defghi"

## Truncate, then pad on the right, to force a constant width
x <- c(" Short", "This is a long string")
x %>%
  str_trunc(25) %>%
  str_pad(25, "right")
# [1] " Short                   "
# [2] "This is a long string    "
## str_trim() removes leading and trailing whitespace
x <- c(" a ", "b ", " c")
str_trim(x)
# [1] "a" "b" "c"
str_trim(x, "left")
# [1] "a " "b " "c"

## str_wrap() modifies existing whitespace in order to wrap a paragraph of
## text; strwrap() is the base R counterpart and returns one element per
## output line, hence the paste(..., collapse = "\n").
jabberwocky <- str_c(
  "Twas brillig, and the slithy toves ",
  "did gyre and gimble in the wabe: ",
  "All mimsy were the borogoves, ",
  "and the mome raths outgrabe. "
)
cat(str_wrap(jabberwocky, width = 40), "\n")
# Twas brillig, and the slithy toves did
# gyre and gimble in the wabe: All mimsy
# were the borogoves, and the mome raths
# outgrabe.
cat(paste(strwrap(jabberwocky, width = 40), collapse = "\n"))
# Twas brillig, and the slithy toves did
# gyre and gimble in the wabe: All mimsy
# were the borogoves, and the mome raths
# outgrabe.
Locale sensitive
The locale always defaults to English to ensure that the default behaviour is identical across systems. Locales always include a two letter ISO-639-1 language code (like “en” for English or “zh” for Chinese), and optionally an ISO-3166 country code (like “en_UK” vs “en_US”). You can see a complete list of available locales by running stringi::stri_locale_list().
# Case conversion is locale sensitive; the optional second argument is the
# locale code.
x <- "I like horses."
str_to_upper(x)
# [1] "I LIKE HORSES."
str_to_title(x)
# [1] "I Like Horses."
str_to_lower(x)
# [1] "i like horses."
## Turkish has two sorts of i: with and without the dot
str_to_lower(x, "tr")
# [1] "ı like horses."
## String ordering and sorting
## str_order() returns the permutation; str_sort() returns the sorted values.
x <- c("y", "i", "k")
str_order(x)
# [1] 2 3 1
str_sort(x)
# [1] "i" "k" "y"
# In Lithuanian, y comes between i and k
str_sort(x, locale = "lt")
# [1] "i" "y" "k"
Engines
There are four main engines that stringr can use to describe patterns:
Regular expressions, the default, as shown above, and described in vignette("regular-expressions").
Fixed bytewise matching, with fixed().
Locale-sensitive character matching, with coll()
Text boundary analysis with boundary().
## fixed(x) only matches the exact sequence of bytes specified by `x`.
## The same visible character can be encoded in more than one way:
## a1 is the precomposed letter, a2 is "a" plus a combining acute accent.
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
# [1] "á" "á"
a1 == a2
# [1] FALSE

## fixed() compares bytes, so the two encodings differ; coll() compares
## using collation rules and treats them as the same character.
str_detect(a1, fixed(a2))
# [1] FALSE
str_detect(a1, coll(a2))
# [1] TRUE
## coll() applies locale-specific rules, e.g. for case-insensitive matching.
## The downside of `coll()` is slow speed.
i <- c("I", "İ", "i", "ı")
i
# [1] "I" "İ" "i" "ı"

str_subset(i, coll("i", ignore_case = TRUE))
# [1] "I" "i"
## In Turkish the upper-case partner of "i" is the dotted "İ"
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
# [1] "İ" "i"
## `boundary()` matches boundaries between characters, lines, sentences or
## words. An empty pattern, "", is treated as `boundary("character")`.
x <- '2B or not 2B, that is a ? "Your future depends on your dreams." So go to sleep.'
str_split(x, boundary("word"))
# [[1]]
# [1] "2B" "or" "not" "2B" "that" "is" "a" "Your" "future" "depends" "on"
# [12] "your" "dreams" "So" "go" "to" "sleep"
str_count(x, boundary("word"))
# [1] 17
str_extract_all(x, boundary("word"))
# [[1]]
# [1] "2B" "or" "not" "2B" "that" "is" "a" "Your" "future" "depends" "on"
# [12] "your" "dreams" "So" "go" "to" "sleep"
Pattern matching
Each pattern matching function has the same first two arguments, a character vector of strings to process and a single pattern to match. stringr provides pattern matching functions to detect, locate, extract, match, replace, and split strings.
## A bare pattern string is equivalent to wrapping it in a call to `regex()`.
## Use `ignore_case = TRUE` for case-insensitive matching, and
## `dotall = TRUE` to let `.` match everything, including `\n`.
str_detect("\nX\n", ".X.")
# [1] FALSE
str_detect("\nx\n", regex(".X.", ignore_case = TRUE, dotall = TRUE))
# [1] TRUE

## `\` is the escape character in regular expressions, and also in R string
## literals, so matching one literal backslash takes the regex `\\`, which
## is written as the R string "\\\\".
x <- "a\\b"
writeLines(x)
# a\b
str_extract(x, "\\\\")
# [1] "\\"

## An alternative quoting mechanism is `\Q...\E`: all the characters in
## `...` are treated as exact matches. This is useful when the text to match
## comes from a variable and may contain regex metacharacters.
x <- c("a.b.c.d", "aeb")
starts_with <- "a.b"
## Unquoted, the "." in `starts_with` matches any character
str_detect(x, paste0("^", starts_with))
# [1] TRUE TRUE
## Quoted, it only matches a literal "."
str_detect(x, paste0("^\\Q", starts_with, "\\E"))
# [1]  TRUE FALSE
## str_match() returns, per input string, the first full match followed by
## one column per capture group; str_match_all() returns a list with one
## such matrix per input string. The base R counterpart is regmatches().
## NOTE(review): `strings` and `email` are not defined in this section;
## they are assumed to be created earlier -- confirm.
str_match(strings, email)
#      [,1]                            [,2]      [,3]   [,4]
# [1,] "apple_juice@orange.com"        NA        NA     NA
# [2,] NA                              NA        NA     NA
# [3,] "fantasy.design@literature.com" ".design" NA     NA
# [4,] "monsoon@season.sea.sky.com"    NA        ".sky" NA
str_match_all(strings, email)
# [[1]]
#      [,1]                     [,2] [,3] [,4]
# [1,] "apple_juice@orange.com" NA   NA   NA
#
# [[2]]
#      [,1] [,2] [,3] [,4]
#
# [[3]]
#      [,1]                            [,2]      [,3] [,4]
# [1,] "fantasy.design@literature.com" ".design" NA   NA
#
# [[4]]
#      [,1]                         [,2] [,3]   [,4]
# [1,] "monsoon@season.sea.sky.com" NA   ".sky" NA
## str_split() returns a list with one character vector per input string;
## str_split_fixed() returns a matrix with exactly `n` columns, the last
## column holding whatever remains unsplit.
str_split("a-b-c-d-e", "-")
# [[1]]
# [1] "a" "b" "c" "d" "e"
str_split_fixed("a-b-c-d-e", "-", n = 3)
#      [,1] [,2] [,3]
# [1,] "a"  "b"  "c-d-e"
Advanced Pattern matching
Individual Unicode characters can be specified in five ways:
\xhh: 2 hex digits.
\x{hhhh}: 1-6 hex digits.
\uhhhh: 4 hex digits.
\Uhhhhhhhh: 8 hex digits.
\N{name}, e.g. \N{grinning face} matches the basic smiling emoji.
Specify common control characters:
\a: bell.
\cX: match a control-X character.
\e: escape (\u001B).
\f: form feed (\u000C).
\n: line feed (\u000A).
\r: carriage return (\u000D).
\t: horizontal tabulation (\u0009).
\0ooo match an octal character. ‘ooo’ is from one to three octal digits,
from 000 to 0377. The leading zero is required.
\X - which matches a grapheme cluster, a set of individual elements that form a single symbol. For example, one way of representing “á” is as the letter “a” plus an accent: . will match the component “a”, while \X will match the complete symbol
\d - matches any digit. The complement, \D, matches any character that is not a decimal digit.
\s - matches any whitespace. This includes tabs, newlines, form feeds, and any character in the Unicode Z Category (which includes a variety of space characters and other separators.). The complement, \S, matches any non-whitespace character.
\p{property name} - matches any character with specific unicode property, like \p{Uppercase} or \p{Diacritic}. The complement, \P{property name}, matches all characters without the property. A complete list of unicode properties can be found at http://www.unicode.org/reports/tr44/#Property_Index.
\w - matches any “word” character, which includes alphabetic characters, marks and decimal numbers. The complement, \W, matches any non-word character.
\b - matches word boundaries, the transition between word and non-word characters. \B matches the opposite: boundaries that have either both word or non-word characters on either side.
## "a" followed by a combining acute accent: two code points, one symbol.
x <- "a\u0301"
## `.` matches a single code point, i.e. only the base letter
str_extract(x, ".")
# [1] "a"
## `\X` matches the whole grapheme cluster
str_extract(x, "\\X")
# [1] "á"
## (presumably printed as "a<U+0301>" on consoles that cannot render the
## combining accent)
## Chinese characters: match one character in the range U+4E00..U+9FA5
str_extract("moon月sun", "[\u4e00-\u9fa5]")
# [1] "月"

## \d includes any character in the Unicode Category of Nd
## ("Number, Decimal Digit"), not only the ASCII digits 0-9.
## Some Khmer digits
str_detect("១២៣", "\\d")
# TRUE
## Finding all fruits that have a repeated pair of letters:
## `(..)` captures any two characters and `\\1` requires the same two
## characters to follow immediately. `fruit` is a data set shipped with
## stringr.
pattern <- "(..)\\1"
fruit %>%
  str_subset(pattern)
# [1] "banana"      "coconut"     "cucumber"    "jujube"      "papaya"
# [6] "salal berry"
## Using (?:...), the non-grouping parentheses, to control precedence but
## not capture the match in a group.
## With capturing parentheses the alternative that matched gets a column:
str_match(c("grey", "gray"), "gr(e|a)y")
#      [,1]   [,2]
# [1,] "grey" "e"
# [2,] "gray" "a"
## With non-capturing parentheses only the full match is returned:
str_match(c("grey", "gray"), "gr(?:e|a)y")
#      [,1]
# [1,] "grey"
# [2,] "gray"
To match a literal “$” or “^”, you need to escape them, \$, and \^.
For multiline strings, you can use regex(multiline = TRUE). This changes the behaviour of ^ and $, and introduces three new operators:
^ now matches the start of each line.
$ now matches the end of each line.
\A matches the start of the input.
\z matches the end of the input.
\Z matches the end of the input, but before the final line terminator,
if it exists.
By default quantifiers are greedy. Put a ? after a quantifier to make it “lazy” (match as little as possible), or a + after it to make the match possessive:
??: 0 or 1, prefer 0.
+?: 1 or more, match as few times as possible.
*?: 0 or more, match as few times as possible.
{n,}?: n or more, match as few times as possible.
{n,m}?: between n and m, match as few times as possible, but at least n.
?+: 0 or 1, possessive.
++: 1 or more, possessive.
*+: 0 or more, possessive.
{n}+: exactly n, possessive.
{n,}+: n or more, possessive.
{n,m}+: between n and m, possessive.
## Greedy versus lazy quantifiers: the pattern vector is matched
## element-wise against the same string, so each call shows the greedy
## result first and the lazy result second.
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_extract(x, c("C{2,3}", "C{2,3}?"))
# [1] "CCC" "CC"
str_extract(x, c("C[LX]+", "C[LX]+?"))
# [1] "CLXXX" "CL"
A related concept is the atomic-match parenthesis, (?>...). If a later match fails and the engine needs to back-track, an atomic match is kept as is: it succeeds or fails as a whole.
## The atomic match fails: the group matches "A", the next character is "B"
## rather than "C", and an atomic group cannot back-track to try ".B".
str_detect("ABC", "(?>A|.B)C")
# [1] FALSE
## The regular match succeeds: the group first matches "A", "C" then fails
## to match, so the engine back-tracks and tries ".B" instead.
str_detect("ABC", "(?:A|.B)C")
# [1] TRUE
These assertions look ahead or behind the current match without “consuming” any characters (i.e. changing the input position).
(?=...): positive look-ahead assertion. Matches if ... matches at the
current input.
(?!...): negative look-ahead assertion. Matches if ... does not
match at the current input.
(?<=...): positive look-behind assertion. Matches if ... matches text
preceding the current position, with the last character of the match
being the character just before the current position. Length must be bounded
(i.e. no * or +).
(?<!...): negative look-behind assertion. Matches if ... does not
match text preceding the current position. Length must be bounded
(i.e. no * or +).
These are useful when you want to check that a pattern exists, but you don’t want to include it in the result:
## Look-ahead: extract the number only when it is followed by " piece" or
## " pieces"; the look-ahead text is not part of the match.
x <- c("1 piece", "2 pieces", "3")
str_extract(x, "\\d+(?= pieces?)")
# [1] "1" "2" NA

## Look-behind: extract the digits only when preceded by a literal "$".
## `(?#...)` is an inline regex comment and matches nothing.
y <- c("100", "$400")
str_extract(y, "(?<=\\$)\\d+(?#this is a comment)")
# [1] NA "400"
Alternatively, use regex(comments = TRUE). This form ignores spaces and newlines, and everything after #.
## Commented regular expression for an American phone number (3-3-4 digits).
## With `comments = TRUE`, spaces and newlines in the pattern are ignored and
## `#` starts a comment that runs to the end of the line, so the pattern must
## be written across several lines for the comments to end where intended.
phone <- regex("
  ## match american phone number
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [)- ]?   # optional closing parens, dash, or space
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # four more numbers
  ", comments = TRUE)