In [ ]:
# Example of messy scraped text: a headline followed by newlines and a run of tabs
uglytext <- "Class of 2018: Senior Stories of Discovery, Learning and Serving\n\n\t\t\t\t\t\t\t"

Power of GREP

See the R regular-expressions manual (`?regex` in R) for the full pattern syntax.


In [ ]:
# Template: gsub(pattern, replacement, x) performs a global find-and-replace
gsub(pattern = "what to change", replacement = "what to replace", x = "what to change in this text")

In [ ]:
# Strip every literal tab character from the text
gsub(pattern = "\t", replacement = "", x = uglytext)

In [ ]:
# Replace every non-alphabetic character (digits, punctuation, whitespace) with a space
gsub(pattern = "[^[:alpha:]]", replacement = " ", x = uglytext)

Working with a corpus and document-term-matrix


In [ ]:
## Loading example text data
# Downloads an .Rdata file over HTTP and loads its contents into the global
# environment; this creates the `trumptweets` data frame used below.

load(url("https://cbail.github.io/Trump_Tweets.Rdata"))
head(trumptweets)

In [ ]:
library(tm)

# Build a tm corpus: each element of the text column becomes one document
trump_corpus <- Corpus(VectorSource(trumptweets$text))

Other ways to read in your corpus:

Corpus( DirSource(directory = "directory") )

Corpus( VectorSource( dataframe$variable ) )


In [ ]:
library(tidytext)
library(dplyr)

In [ ]:
# Tokenize to tidy format: one row per word, keeping the tweet timestamp
# (created_at) as the document identifier
tidy_trump_tweets <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens("word", text)

In [ ]:
# Peek at the first rows of the tokenized data
head(tidy_trump_tweets)

In [ ]:
# Most frequent tokens first; count(sort = TRUE) combines the
# count-then-arrange(desc(n)) steps into one call
tidy_trump_tweets %>%
  count(word, sort = TRUE)

In [ ]:
# Clean the token list: drop stop words and bare numbers, strip stray
# whitespace, then stem each word.
data("stop_words")

# Remove common English stop words, joining explicitly on the token column
tidy_trump_tweets <- tidy_trump_tweets %>%
  anti_join(stop_words, by = "word")

# Drop tokens that are entirely numeric. Negating grepl() is safer than the
# x[-grep(...), ] pattern: when grep() finds no match it returns integer(0),
# and negative indexing with integer(0) would silently drop EVERY row.
tidy_trump_tweets <- tidy_trump_tweets[!grepl("\\b\\d+\\b", tidy_trump_tweets$word), ]

# Remove any whitespace left inside tokens
tidy_trump_tweets$word <- gsub("\\s+", "", tidy_trump_tweets$word)

## remove some extra nonsense
# tidy_trump_tweets$word <- gsub("https","",tidy_trump_tweets$word)
# tidy_trump_tweets$word <- gsub("rt "," ",tidy_trump_tweets$word)

# Stem each token (e.g. "running" -> "run"). wordStem() is vectorized, so a
# plain mutate() replaces the deprecated mutate_at()/funs() combination.
library(SnowballC)
tidy_trump_tweets <- tidy_trump_tweets %>%
  mutate(word = wordStem(word, language = "en"))

In [ ]:
# Re-check the top tokens after cleaning and stemming
tidy_trump_tweets %>%
  count(word, sort = TRUE)

In [ ]:
# Build a document-term matrix: one row per tweet (keyed by timestamp),
# one column per token, cells holding the within-tweet counts
tidy_trump_DTM <- tidy_trump_tweets %>%
  count(created_at, word) %>%
  cast_dtm(created_at, word, n)

In [ ]:
# Inspect the document-term matrix (its print method summarizes the object)
tidy_trump_DTM

In [ ]:
# TF-IDF from the raw tweets in one pipeline: tokenize, drop stop words,
# count each word per tweet, then weight by tf-idf
tidy_trump_tfidf <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens("word", text) %>%
  anti_join(stop_words) %>%
  count(word, created_at) %>%
  bind_tf_idf(word, created_at, n)

In [ ]:
# Order terms by tf-idf score, highest first
top_tfidf <- tidy_trump_tfidf %>%
  arrange(desc(tf_idf))

# The single highest-scoring term
top_tfidf$word[1]

In [ ]: