In [ ]:
# Example of messy scraped text: a headline followed by stray newlines and a
# run of literal tab characters, used below to demonstrate gsub() cleaning.
uglytext <- "Class of 2018: Senior Stories of Discovery, Learning and Serving\n\n\t\t\t\t\t\t\t"
See the gsub() manual page (run `?gsub` in R) for the full set of pattern-replacement options.
In [ ]:
# Basic find-and-replace: every match of `pattern` in `x` becomes
# `replacement`. Named-argument form makes the three roles explicit.
gsub(pattern = "what to change", replacement = "what to replace",
     x = "what to change in this text")
In [ ]:
# Delete every tab character from the messy string (named-argument form).
gsub(pattern = "\t", replacement = "", x = uglytext)
In [ ]:
# Replace everything that is NOT a letter (digits, punctuation, whitespace)
# with a single space; [^[:alpha:]] is the negated POSIX letter class.
gsub(pattern = "[^[:alpha:]]", replacement = " ", x = uglytext)
In [ ]:
## Loading example text data
# Download a remote .Rdata file and load the object(s) it contains into the
# global environment (this creates `trumptweets`), then preview the result.
load(url("https://cbail.github.io/Trump_Tweets.Rdata"))
head(trumptweets)
In [ ]:
library(tm)
# Build a tm Corpus with one document per tweet, reading from the `text`
# column of the loaded trumptweets data.
trump_corpus <- Corpus(VectorSource( trumptweets$text) )
Other ways to read your corpus:
Corpus( DirSource(directory = "directory") )
Corpus( VectorSource( dataframe$variable ) )
In [ ]:
library(tidytext)
library(dplyr)
In [ ]:
# Tokenize the tweets into tidy format: keep the timestamp, split the text
# column into one row per word. Nested-call form of the original pipeline.
tidy_trump_tweets <- unnest_tokens(
  select(trumptweets, created_at, text),
  "word",
  text
)
In [ ]:
# Preview the tokenized data produced by unnest_tokens() above.
head( tidy_trump_tweets )
In [ ]:
# Word frequencies, most common first. count(sort = TRUE) is dplyr's
# built-in equivalent of count() followed by arrange(desc(n)).
tidy_trump_tweets %>%
  count(word, sort = TRUE)
In [ ]:
data("stop_words")

# Remove stop words ("the", "and", ...). The join key is made explicit so
# the join cannot silently pick up extra columns, and no message is printed.
tidy_trump_tweets<-tidy_trump_tweets %>%
  anti_join(stop_words, by = "word")

# Drop tokens that contain a standalone run of digits. BUG FIX: the original
# used tidy_trump_tweets[-grep(...), ], which deletes EVERY row when the
# pattern matches nothing (negating integer(0) selects zero rows). Logical
# indexing with !grepl() is safe in the no-match case and identical otherwise.
tidy_trump_tweets<-tidy_trump_tweets[!grepl("\\b\\d+\\b", tidy_trump_tweets$word),]

# Strip any residual whitespace inside tokens.
tidy_trump_tweets$word <- gsub("\\s+","",tidy_trump_tweets$word)
## remove some extra nonsense
# tidy_trump_tweets$word <- gsub("https","",tidy_trump_tweets$word)
# tidy_trump_tweets$word <- gsub("rt "," ",tidy_trump_tweets$word)

# Stem each word with the Snowball (Porter) stemmer, e.g. "running" -> "run".
# funs() is deprecated since dplyr 0.8 and mutate_at() is superseded; a plain
# mutate() call is the modern equivalent.
library(SnowballC)
tidy_trump_tweets<-tidy_trump_tweets %>%
  mutate(word = wordStem(word, language = "en"))
In [ ]:
# Recount after cleaning/stemming; sort = TRUE puts the most frequent
# (now stemmed) words at the top, replacing the explicit arrange() step.
tidy_trump_tweets %>%
  count(word, sort = TRUE)
In [ ]:
# Build a document-term matrix: one row per tweet (keyed by timestamp), one
# column per word, cell values = within-tweet counts. Nested-call form of
# the original count() %>% cast_dtm() pipeline.
tidy_trump_DTM <- cast_dtm(
  count(tidy_trump_tweets, created_at, word),
  created_at, word, n
)
In [ ]:
# Inspect the document-term matrix object built above.
tidy_trump_DTM
In [ ]:
# tf-idf weighting: words frequent within one tweet but rare across tweets
# score highest. First tokenize and drop stop words, then count and weight.
trump_tokens <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens("word", text) %>%
  anti_join(stop_words)

tidy_trump_tfidf <- trump_tokens %>%
  count(word, created_at) %>%
  bind_tf_idf(word, created_at, n)
In [ ]:
# Order terms from most to least distinctive (highest tf-idf first),
# then show the single most distinctive term.
top_tfidf <- tidy_trump_tfidf %>% arrange(desc(tf_idf))
top_tfidf[["word"]][1]
In [ ]: