######## Intro ########

### two vectors
area <- 1:12
rate <- c(1.2, 1.5, 1.02, 1.63, 1.14, 1.86, 2.1, 1.43, 1.76, 2.08, 1.32, 1.6)

### list the variables defined so far
ls()

### display a vector
area

### matrix built column-wise from the two vectors
mat <- cbind(area, rate)

### display the 2nd row of the matrix
mat[2, ]

### plot the matrix
plot(mat)

### another vector
area <- 1:12

### generate a sequence
rate <- seq(from = 1, by = 0.1, length.out = 12)
mat2 <- cbind(area, rate)
plot(mat)

### with new = TRUE the plotting window is not cleared before the next plot
par(new = TRUE)
plot(mat2)

### tell the two series apart and give them the same scale
plot(mat, main = "pokusne matice", ylim = c(1, 2.2))
par(new = TRUE)
plot(mat2, ylim = c(1, 2.2), pch = 2)

### add a legend
legend(x = "topleft", legend = c("mat", "mat2"), pch = c(1, 2),
       bg = "white", inset = 0.01)

### add "boundary" lines
abline(v = c(6, 8), lty = 4)

### text
text(7, 1.2, "spatna oblast", srt = 90)


######### tm ##########
### Pavel Brazdil, LIAAD, FEP Universidade do Porto ###
library(tm)

### data freely available on the internet - 20newsgroups
### http://people.csail.mit.edu/jrennie/20Newsgroups/

# Read every file of one newsgroup directory as a plain-text English corpus.
read.newsgroup <- function(directory) {
  Corpus(DirSource(directory),
         readerControl = list(reader = readPlain, language = "en_US"))
}

electr.train <- read.newsgroup("20news-bydate-train/sci.electronics")
print(electr.train)
summary(electr.train)
inspect(electr.train[1:3])

### three more corpora
religion.train <- read.newsgroup("20news-bydate-train/talk.religion.misc")
electr.test <- read.newsgroup("20news-bydate-test/sci.electronics")
religion.test <- read.newsgroup("20news-bydate-test/talk.religion.misc")

# Document indices in the combined corpus, as recorded by the original author:
#   electr.train   - documents    1 ..  591
#   religion.train - documents  592 ..  968 (377 docs)
#   electr.test    - documents  969 .. 1361 (393 docs)
#   religion.test  - documents 1362 .. 1612 (251 docs)
# The code below derives these sizes from the corpora instead of hard-coding
# them, so labels and the train/test split stay aligned if the data differ.
all <- c(electr.train, religion.train, electr.test, religion.test)

### preprocessing
# NOTE(review): the tm_map()/DocumentTermMatrix() calls use the idioms of the
# tm release this script was written for. Newer tm releases reportedly need
# content_transformer() around tolower, changed the PlainTextDocument step,
# and renamed the minWordLength/minDocFreq controls - confirm against the
# installed tm version.
all <- tm_map(all, PlainTextDocument)
# Lower-case first: the stop-word list is lower-case, so stop words must be
# removed from lower-cased text or capitalised ones ("The", "And") survive.
all <- tm_map(all, tolower)
# Positional argument: the parameter name of stopwords() may differ between
# tm releases - TODO confirm.
all <- tm_map(all, removeWords, stopwords("english"))
all <- tm_map(all, removePunctuation)
all <- tm_map(all, removeNumbers)
# Collapse whitespace last so gaps left by the removals above are cleaned up.
all <- tm_map(all, stripWhitespace)

### stemming - very slow...
### library(rJava); library(rWeka); library(Snowball)
### all <- tm_map(all, stemDocument)

DocumentTermMatrix(all)
dtm.all <- DocumentTermMatrix(all,
                              control = list(minWordLength = 2, minDocFreq = 5))
findFreqTerms(dtm.all, 40)

# as.matrix() is the conversion function; inspect() is a display function and
# would print the whole matrix to the console.
dtm.all.frame <- as.data.frame(as.matrix(dtm.all))
# A term literally named "class" would duplicate the name of the label column
# appended below and make `class ~ ...` ambiguous; it is not a predictor here.
dtm.all.frame <- dtm.all.frame[, names(dtm.all.frame) != "class", drop = FALSE]

class <- c(rep("sci", length(electr.train)),
           rep("rel", length(religion.train)),
           rep("sci", length(electr.test)),
           rep("rel", length(religion.test)))
stopifnot(length(class) == nrow(dtm.all.frame))
# Explicit factor: rpart needs a factor response for a classification tree,
# and cbind() no longer converts strings to factors on R >= 4.0.
dtm.all.frame <- cbind(dtm.all.frame, class = factor(class))

## prepare the training data (all columns, label included)
n.train <- length(electr.train) + length(religion.train)
label.col <- ncol(dtm.all.frame)
train.rows <- seq_len(n.train)
test.rows <- setdiff(seq_len(nrow(dtm.all.frame)), train.rows)
train <- dtm.all.frame[train.rows, ]

### test data: predictors only; the true labels are kept separately
# The original `1:ncol(x) - 1` parsed as `(1:ncol(x)) - 1`, i.e. 0:(n - 1),
# and only worked because a zero index is silently dropped.
test <- dtm.all.frame[test.rows, -label.col, drop = FALSE]
testclass <- dtm.all.frame[test.rows, label.col]

### decision tree
library(rpart)
dt <- rpart(class ~ cable + circuit + ground + neutral + outlets + subject +
              wire + wiring + judas + ra + christ + elohim + father + god +
              gods + jehovah + jesus + lord + mcconkie + ps + son + unto,
            data = train, method = "class")

### prediction and confusion matrix (rows: true class, columns: predicted)
dt.predictions <- predict(dt, test, type = "class")
table(testclass, dt.predictions)