#Regular Expression Exercise

# R has various functions for regular-expression-based matching and replacement. The grep, grepl, regexpr and gregexpr functions search for matches, while sub and gsub perform replacement.

# grep(value = FALSE) returns an integer vector of the indices of the elements 
# of x that yielded a match (or not, for invert = TRUE).
# See the differences in the following examples:
# Sample input shared by the grep() and grepl() calls in this section.
str <- c("Regular", "expression", "examples of R language")

# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, so relying on them is fragile.
# value = FALSE: integer indices of the elements that match.
x <- grep("ex", str, value = FALSE)
x
# value = TRUE: the matching elements themselves.
x <- grep("ex", str, value = TRUE)
x
# invert = TRUE: indices of the elements that do NOT match.
x <- grep("ex", str, value = FALSE, invert = TRUE)
x


# grepl returns a logical vector (match or not for each element of x).
x <- grepl("ex", str)
x


# sub and gsub return a character vector of the same length and with the same attributes as x (after possible coercion to character). 
# Elements of character vectors x which are not substituted will be returned unchanged (including any declared encoding). 
# If useBytes = FALSE a non-ASCII substituted result will often be in UTF-8 with a marked encoding (e.g. if there is a UTF-8 input, and in a multibyte locale unless fixed = TRUE).
str <- c("Regular", "expression", "examples of R language")

# "." stands for exactly one arbitrary character between "x" and "ress".
x <- sub(pattern = "x.ress", replacement = "", x = str)
# Inspect the result and work out why it looks the way it does.
x

# ".+" is greedy: it stretches from the first "x" to the last "g".
x <- sub(pattern = "x.+g", replacement = "", x = str)
# Inspect the result and work out why it looks the way it does.
x


# Strip every digit using the POSIX character class, one digit per match.
x <- "line 4322: He is now 25 years old, and weights 130lbs"
x <- gsub(pattern = "[[:digit:]]", replacement = "", x = x)
x


# Same outcome with the "\d" shorthand, removing whole runs of digits at once.
x <- "line 4322: He is now 25 years old, and weights 130lbs"
x <- gsub(pattern = "\\d+", replacement = "", x = x)
x


# regexpr returns an integer vector of the same length as text giving 
# the starting position of the first match or -1 if there is none, 
# with attribute "match.length", an integer vector giving the length of 
# the matched text (or -1 for no match). 
# The match positions and lengths are in characters unless useBytes = TRUE is used, 
# when they are in bytes.

str <- c("Regular", "expression", "examples of R language")
# Position and length of the first match per element; -1 marks "no match".
x <- regexpr(pattern = "x*ress", text = str)
x
# The same search with useBytes passed explicitly as FALSE.
x <- regexpr(pattern = "x*ress", text = str, useBytes = FALSE)
x


#grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
#       fixed = FALSE, useBytes = FALSE, invert = FALSE)

#grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
#      fixed = FALSE, useBytes = FALSE)

#sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
#    fixed = FALSE, useBytes = FALSE)

#gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
#     fixed = FALSE, useBytes = FALSE)


#More complicated Examples

txt <- c("arm", "foot", "lefroo", "bafoobar")
# Collect the indices of the elements containing "foo", then report if any.
i <- grep("foo", txt)
if (length(i) > 0) {
  cat("'foo' appears at least once in\n\t", txt, "\n")
}
i # 2 and 4
txt[i]


# Word list used by the pattern examples that follow.
txt <- c(
  "The", "licenses", "for", "most", "software", "are", "designed", "to",
  "take", "away", "your", "freedom", "to", "share", "and", "change", "it.",
  "", "By", "contrast,", "the", "GNU", "General", "Public", "License", "is",
  "intended", "to", "guarantee", "your", "freedom", "to", "share", "and",
  "change", "free", "software", "--", "to", "make", "sure", "the",
  "software", "is", "free", "for", "all", "its", "users"
)
# find the words with g or u
i <- grep("[gu]", txt) # indices
i


## Note that in locales such as en_US this includes B as the
## collation order is aAbBcCdDeE ...
# sub() replaces only the first match in each element ...
ot <- sub(pattern = "[b-e]", replacement = ".", x = txt)
ot
# ... whereas gsub() replaces every match.
ot <- gsub(pattern = "[b-e]", replacement = ".", x = txt)
ot
# check the difference between previous 2 commands

# Words for which the global and the first-only substitution disagree.
txt[ot != sub(pattern = "[b-e]", replacement = ".", x = txt)]

# Words that change only when case is ignored: the "G" words
txt[gsub("g", "#", txt) != gsub("g", "#", txt, ignore.case = TRUE)]


## trim trailing white space
str <- "Now is the time      "
# Literal space characters only.
sub(pattern = " +$", replacement = "", x = str)
## what is considered 'white space' depends on the locale.
# POSIX-style white-space class.
sub(pattern = "[[:space:]]+$", replacement = "", x = str)
## what PCRE considered white space changed in version 8.34: see ?regex
sub(pattern = "\\s+$", replacement = "", x = str, perl = TRUE)


# ultra complicated
## capitalizing
txt <- "a test of capitalizing"
# Upper-case the first letter of each word and lower-case the remainder.
gsub(pattern = "(\\w)(\\w*)", replacement = "\\U\\1\\L\\2", x = txt,
     perl = TRUE)
# Upper-case only the letter that follows a word boundary.
gsub(pattern = "\\b(\\w)", replacement = "\\U\\1", x = txt, perl = TRUE)

txt2 <- "useRs may fly into JFK or laGuardia"
# Upper-case the first and last letter of every word, leaving the middle as is.
gsub(pattern = "(\\w)(\\w*)(\\w)", replacement = "\\U\\1\\E\\2\\U\\3",
     x = txt2, perl = TRUE)
# The same replacement applied to the first match only.
sub(pattern = "(\\w)(\\w*)(\\w)", replacement = "\\U\\1\\E\\2\\U\\3",
    x = txt2, perl = TRUE)


## Decompose a URL into its components.
## Example by LT (http://www.cs.uiowa.edu/~luke/R/regexp.html).
x <- "http://stat.umn.edu:80/xyz"
# regexec() reports the position of the full match and of each
# parenthesized sub-expression.
m <- regexec(pattern = "^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", text = x)
m

# Turn the match positions into the matched substrings.
regmatches(x = x, m = m)
## Element 3 is the protocol, 4 is the host, 6 is the port, and 7
## is the path.  We can use this to make a function for extracting the
## parts of a URL:
# Split URLs into protocol, host, port and path.
#
# Args:
#   x: character vector of URLs.
#
# Returns:
#   A character matrix with one row per element of `x` and the columns
#   "protocol", "host", "port" and "path". Zero-length input yields a
#   0 x 4 matrix instead of an error.
URL_parts <- function(x) {
  part_names <- c("protocol", "host", "port", "path")

  # With no input, do.call(rbind, list()) would return NULL and setting
  # colnames on it would fail, so return an empty, correctly shaped matrix.
  if (length(x) == 0L) {
    return(matrix(character(0), nrow = 0L, ncol = length(part_names),
                  dimnames = list(NULL, part_names)))
  }

  m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/.*)", x)
  # Elements 3, 4, 6 and 7 of each match hold protocol, host, port and path.
  pieces <- lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L))
  parts <- do.call(rbind, pieces)
  colnames(parts) <- part_names
  parts
}
# Run the helper on the example URL stored in `x`; the result is auto-printed.
URL_parts(x)


#============Exercises==============
# 2 examples of HTML code
# Two HTML snippets for the exercises: a paragraph and a level-2 heading.
text <- c(
  paste0("<P> A regular expression (regex or regexp for short) is a special ",
         "text string for describing a serach pattern.</P>"),
  "<H2> Regular Expression Quick Start</H2>"
)

# 1. Which string is a heading?
# Hint - in HTML, a heading is marked with one of the tags <H1> - <H6>


# 2. Make the heading in html code smaller
# Hint - H with higher values means smaller text


# 3. Find out whether there are brackets in the text


# 4. If there are brackets, find the text inside them


# 5. Find the HTML tags used in the text
# Hint: an HTML tag is enclosed in <>


# 6. Capitalize the first letter of each word


# 7. Change the second element to be a paragraph


# 8. Make all text lower case


# 9. Replace all s with capital S