# Regular Expression Exercise
#
# R has various functions for regular-expression-based matching and
# replacement. The grep, grepl, regexpr and gregexpr functions search for
# matches, while sub and gsub perform replacement.

# grep(value = FALSE) returns an integer vector of the indices of the elements
# of x that yielded a match (or not, for invert = TRUE).
# See the differences in the following examples:
str <- c("Regular", "expression", "examples of R language")
x <- grep("ex", str, value = FALSE)  # indices of the matching elements
x
x <- grep("ex", str, value = TRUE)  # the matching elements themselves
x
x <- grep("ex", str, value = FALSE, invert = TRUE)  # indices of non-matches
x

# grepl returns a logical vector (match or not for each element of x).
x <- grepl("ex", str)
x

# sub and gsub return a character vector of the same length and with the same
# attributes as x (after possible coercion to character).
# Elements of character vectors x which are not substituted will be returned
# unchanged (including any declared encoding).
# If useBytes = FALSE a non-ASCII substituted result will often be in UTF-8
# with a marked encoding (e.g. if there is a UTF-8 input, and in a multibyte
# locale unless fixed = TRUE).
str <- c("Regular", "expression", "examples of R language")
x <- sub("x.ress", "", str)  # think about why the solution looks like that
x
x <- sub("x.+g", "", str)  # think about why the solution looks like that
x

# Remove digits one character at a time (POSIX character class) ...
x <- "line 4322: He is now 25 years old, and weights 130lbs"
x <- gsub("[[:digit:]]", "", x)
x
# ... or one run of digits at a time; the resulting string is the same.
x <- "line 4322: He is now 25 years old, and weights 130lbs"
x <- gsub("\\d+", "", x)
x

# regexpr returns an integer vector of the same length as text giving
# the starting position of the first match or -1 if there is none,
# with attribute "match.length", an integer vector giving the length of
# the matched text (or -1 for no match).
# The match positions and lengths are in characters unless useBytes = TRUE
# is used, when they are in bytes.
str <- c("Regular", "expression", "examples of R language")
x <- regexpr("x*ress", str)
x
# useBytes = FALSE is the default, so this call repeats the one above.
x <- regexpr("x*ress", str, useBytes = FALSE)
x

# Usage reference:
# grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
#      fixed = FALSE, useBytes = FALSE, invert = FALSE)
# grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
#       fixed = FALSE, useBytes = FALSE)
# sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
#     fixed = FALSE, useBytes = FALSE)
# gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
#      fixed = FALSE, useBytes = FALSE)

# More complicated examples
txt <- c("arm", "foot", "lefroo", "bafoobar")
i <- grep("foo", txt)
if (length(i) > 0) {
  cat("'foo' appears at least once in\n\t", txt, "\n")
}
i  # 2 and 4
txt[i]

txt <- c(
  "The", "licenses", "for", "most", "software", "are", "designed", "to",
  "take", "away", "your", "freedom", "to", "share", "and", "change", "it.",
  "", "By", "contrast,", "the", "GNU", "General", "Public", "License", "is",
  "intended", "to", "guarantee", "your", "freedom", "to", "share", "and",
  "change", "free", "software", "--", "to", "make", "sure", "the",
  "software", "is", "free", "for", "all", "its", "users"
)
# find the words with g or u
(i <- grep("[gu]", txt))  # indices

## Note that in locales such as en_US this includes B as the
## collation order is aAbBcCdDeE ...
(ot <- sub("[b-e]", ".", txt))
(ot <- gsub("[b-e]", ".", txt))
# check the difference between the previous 2 commands: the words where
# the global substitution replaced more than the first match
txt[ot != sub("[b-e]", ".", txt)]

txt[gsub("g", "#", txt) != gsub("g", "#", txt, ignore.case = TRUE)]  # the "G" words

## trim trailing white space
str <- "Now is the time "
sub(" +$", "", str)  ## spaces only
## what is considered 'white space' depends on the locale.
sub("[[:space:]]+$", "", str)  ## white space, POSIX-style
## what PCRE considered white space changed in version 8.34: see ?regex
sub("\\s+$", "", str, perl = TRUE)

# ultra complicated
## capitalizing
txt <- "a test of capitalizing"
gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl = TRUE)
gsub("\\b(\\w)", "\\U\\1", txt, perl = TRUE)

txt2 <- "useRs may fly into JFK or laGuardia"
# Same pattern and replacement; gsub rewrites every word, sub only the first.
gsub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl = TRUE)
sub("(\\w)(\\w*)(\\w)", "\\U\\1\\E\\2\\U\\3", txt2, perl = TRUE)

## Decompose a URL into its components.
## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
x <- "http://stat.umn.edu:80/xyz"
m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
m
regmatches(x, m)
## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
## is the path. We can use this to make a function for extracting the
## parts of a URL:

#' Split URLs into protocol, host, port and path.
#'
#' @param x Character vector of URLs.
#' @return A character matrix with one row per element of `x` and the
#'   columns "protocol", "host", "port" and "path".
URL_parts <- function(x) {
  m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
  # regmatches() yields, per URL, the full match followed by the capture
  # groups; keep only the four groups holding the wanted components.
  fields <- lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))
  parts <- do.call(rbind, fields)
  colnames(parts) <- c("protocol", "host", "port", "path")
  parts
}
URL_parts(x)

#============Exercises==============
# 2 examples of HTML code
text <- c("
A regular expression (regex or regexp for short) is a special text string for describing a serach pattern.
", "