poissonisfish

Pingback: Twitter data analysis in R – Technology Revolution

Pingback: Twitter data analysis in R – Data Science Austria

2019-10-10T23:50:20+00:00

Great my friend!!!

LikeLike

Reply

2019-10-11T00:23:39+00:00

Amazing post! Ever so detailed and peppered with subtle humor 🙂

LikeLike

Reply

Pingback: Twitter data analysis in R | R-bloggers

Pingback: Twitter data analysis in R – Technology Revolution

Pingback: Audio classification in R – poissonisfish

Pingback: Object detection and tracking in Python – poissonisfish

	#!/Library/Frameworks/R.framework/Resources/Rscript
	# Mon Apr 15 18:41:47 2019 ------------------------------
	library(rtweet)

	# Twitter API
	# Register the app credentials with rtweet (replace every placeholder)
	create_token(
	  app = "INSERT_HERE",
	  consumer_key = "INSERT_HERE",
	  consumer_secret = "INSERT_HERE",
	  access_token = "INSERT_HERE",
	  access_secret = "INSERT_HERE"
	)

	# Read GOT tweets from US
	# NOTE: apiKey (the Google Maps API key) is not defined in this snippet and
	# must exist before this point
	usCoords <- lookup_coords("usa", apikey = apiKey)
	newTweets <- search_tweets(
	  q = "game of thrones",
	  n = 1e5, # 1st day 3e5, to go back ~1 week
	  include_rts = FALSE,
	  geocode = usCoords,
	  lang = "en",
	  retryonratelimit = TRUE
	)

	# Specify dir
	dirPath <- "~/Documents/INSERT_PATH"

	# Folder holding one CSV per harvest. file.path() inserts the separator, so
	# dirPath works with or without a trailing slash
	tweetDir <- file.path(dirPath, "tweets")

	# Create dir for storage
	# The folder is created at the same location that is checked and written
	# to (previously "tweets/" was created in the working directory instead)
	if (!dir.exists(tweetDir)) {
	  dir.create(tweetDir, recursive = TRUE)
	}

	# Write csv with date
	# One file per day, named after the current date
	save_as_csv(newTweets,
	  file.path(tweetDir, paste0(Sys.Date(), ".csv")),
	  prepend_ids = TRUE, na = "",
	  fileEncoding = "UTF-8")

	# Wed May 8 21:22:45 2019 ------------------------------
	# Use status_id to identify and exclude duplicates
	library(rtweet)

	# List all files
	# full.names = TRUE returns ready-to-read "tweets/<file>" paths, and an
	# empty vector (rather than the bare string "tweets/") when the folder
	# contains no files
	allFiles <- list.files("tweets", full.names = TRUE)

	# Write function to merge tweets
	# Appends to `recipient` the rows of `donor` whose status_id does not yet
	# occur in `recipient`, and returns the combined table. Duplicates inside
	# `donor` itself are not removed.
	mergeTweets <- function(recipient, donor){
	  isNew <- is.na(match(donor$status_id, recipient$status_id))
	  do_call_rbind(list(recipient, donor[isNew, ]))
	}

	# Read the daily files in order: the first one seeds allTweets, every later
	# one contributes only the tweets not collected so far
	for (k in seq_along(allFiles)) {
	  current <- read_twitter_csv(file = allFiles[k], unflatten = TRUE)
	  if (k == 1L) {
	    allTweets <- current
	  } else {
	    allTweets <- mergeTweets(allTweets, current)
	  }
	}

	# Write CSV
	write_as_csv(allTweets, file_name = "gotTwitter.csv")

	# Load libraries
	library(tidyverse)
	library(reshape2)
	library(ggplot2)
	library(ggridges)
	library(lubridate)
	library(rtweet)
	library(maps)
	library(quanteda)

	# Read final dataset
	allTweets <- read_twitter_csv("../input/gotTwitter.csv", unflatten = TRUE)

	# Convert UTC to US Eastern time (EDT during the airing period)
	# Plain assignment replaces `%<>%`: that operator is exported by magrittr,
	# which is not attached by the packages loaded for this script, so the
	# original line stopped with "could not find function"
	allTweets <- dplyr::mutate(
	  allTweets,
	  created_at = with_tz(
	    as_datetime(created_at, tz = "UTC"),
	    tzone = "America/New_York"
	  )
	)

	# Produce lat and lng coordinates
	allTweets <- lat_lng(allTweets)
	# Plot
	# Wide margins on all four sides, then a thin-lined state outline
	par(mar = rep(12, 4))
	map("state", lwd = .25)
	# plot lat and lng points onto state map
	# Small, mostly transparent dots so that dense areas appear darker
	points(
	  allTweets$lng, allTweets$lat,
	  pch = 16, cex = .25,
	  col = rgb(.8, .2, 0, .2)
	)

poissonisfish

Twitter data analysis in R

Introduction

Kaggle

The cron scheduler

Sharing is caring

Let’s get started with R

Harvest

Twitter API

Google Maps API (optional)

Tweet search

Cron job setup

Processing

Get-Started analysis

Since you are here

Dealing with Twitter bots

Wordclouds

Sentiment analysis

Wrap-up

Citation

8 thoughts on “Twitter data analysis in R”

Leave a comment Cancel reply

	# Google Maps API https://developers.google.com/maps/documentation/javascript/get-api-key
	# Placeholder for the Google Maps API key. The tweet-search snippet passes
	# it to lookup_coords() as `apikey`, so it has to be set before that runs
	apiKey <- "INSERT_HERE"

	# Tokenize words
	# Drop Twitter markup, separators, symbols, punctuation, URLs, hyphens and
	# numbers, then keep both single words and two-word sequences
	tkn <- tokens(
	  allTweets$text,
	  remove_twitter = TRUE,
	  remove_separators = TRUE,
	  remove_symbols = TRUE,
	  remove_punct = TRUE,
	  remove_url = TRUE,
	  remove_hyphens = TRUE,
	  remove_numbers = TRUE
	)
	tkn <- tokens_ngrams(tkn, n = 1:2)

	# Lower-cased document-feature matrix without English stopwords
	gotDfm <- dfm(tkn, tolower = TRUE, remove = stopwords("english"))

	# Characters of interest; two-word names are written with "_"
	# (presumably to match the bigram tokens produced above - confirm)
	gotChars <- c(
	  "jon", "cersei", "sansa", "arya",
	  "bran", "tyrion", "jaime", "daenerys",
	  "hound", "davos", "missandei", "theon",
	  "brienne", "gendry", "grey_worm", "jorah",
	  "night_king", "varys", "melisandre", "tormund"
	)

	# Co-occurrence matrix restricted to the character names
	gotFcm <- fcm(dfm_select(gotDfm, pattern = gotChars))

	# Fixed seed so the network layout is reproducible
	set.seed(100)
	textplot_network(
	  gotFcm,
	  min_freq = 0.1,
	  edge_alpha = .25,
	  edge_size = 5
	)

	# Identify tweets containing any of the characters names (0/1)
	# One 0/1 vector per character, with one entry per tweet
	popularity <- as.data.frame(lapply(gotChars, function(char) {
	  as.integer(vapply(tkn, function(words) char %in% words, logical(1)))
	}))

	# Write colnames
	colnames(popularity) <- gotChars

	# Add column with the tweet timestamps (same rows as allTweets)
	popularity$created_at <- allTweets$created_at

	# Reshape w.r.t. created_at, select hits
	# Long format: one row per (tweet, character); keep only actual mentions
	popularity <- reshape2::melt(popularity, id.vars = "created_at")
	popularity <- dplyr::filter(popularity, value == 1)

	# Determine the time all six episodes were aired
	# (9pm US Eastern every Sunday starting 14th April)
	# "America/New_York" is used instead of "EST": "EST" is a fixed UTC-5 zone,
	# while New York is on daylight time (UTC-4) in April-May, so "EST" put every
	# marker one hour after 9pm on the America/New_York axis of created_at
	epAirTime <- ymd_hms("2019-04-14 21:00:00", tz = "America/New_York") +
	  dweeks(0:5)

	# Plot ggridge-style
	# One density ridge per character, dashed red lines at the air times and
	# an "Ep.N" label next to each line
	ggplot(popularity, aes(x = created_at, y = variable, fill = variable)) +
	  geom_density_ridges() +
	  geom_vline(xintercept = epAirTime, linetype = "dashed",
	    color = "red", show.legend = TRUE) +
	  theme_ridges() +
	  theme(legend.position = "none") +
	  annotate("text", x = epAirTime, y = 20.75,
	    label = paste0("Ep.", seq_along(epAirTime)), hjust = 1.25)

	# Sat Oct 5 10:06:01 2019 ------------------------------
	# Bonus - rm bots, time-dependent wordclouds & sentiment analysis
	# One row per screen_name: number of tweets, mean follower count and
	# median retweet count
	rtStats <- do.call("rbind", by(
	  allTweets,
	  INDICES = allTweets$screen_name,
	  FUN = function(acct) {
	    data.frame(
	      num_tweets = nrow(acct),
	      mean_followers = mean(acct$followers_count),
	      median_rt = median(acct$retweet_count)
	    )
	  }
	))

	# Plot log10(num_tweets) vs. log10(median_rt)
	# Point size scales with log10(mean followers + 1) relative to the maximum
	logStats <- log10(rtStats + 1)
	plot(
	  logStats$num_tweets, logStats$median_rt,
	  cex = logStats$mean_followers / max(logStats$mean_followers),
	  pch = 16,
	  col = rgb(0, 0, 0, .25),
	  xlab = expression(paste(log[10], " # tweets + 1")),
	  ylab = expression(paste(log[10], " median # rts + 1"))
	)
	# Legend sizes use the same scaling as the points
	nums <- c(1e2, 1e4, 1e6, 1e8)
	legend(
	  "topright",
	  title = "# followers",
	  legend = formatC(nums, format = "e", digits = 1),
	  pch = 16,
	  col = rgb(0, 0, 0, .25),
	  pt.cex = log10(nums + 1) / max(logStats$mean_followers),
	  bty = "n"
	)

	# Wordcloud
	# Remove potential bots w/ > 100 tweets in the dataset
	bots <- rownames(rtStats)[which(rtStats$num_tweets > 100)]
	reducedTweet <- allTweets[!allTweets$screen_name %in% bots, ]

	# Keep ASCII only (other characters are dropped) and blank out
	# leftover "<...>" code tags
	cleanText <- texts(reducedTweet$text)
	cleanText <- iconv(cleanText, from = "UTF-8", to = "ASCII", sub = "")
	reducedTweet$text <- gsub(
	  pattern = "<[A-Z+0-9]+>",
	  replacement = " ",
	  x = cleanText
	)

	# Tokenize words
	tkn <- tokens(
	  reducedTweet$text,
	  remove_twitter = TRUE,
	  remove_separators = TRUE,
	  remove_symbols = TRUE,
	  remove_punct = TRUE,
	  remove_url = TRUE,
	  remove_hyphens = TRUE,
	  remove_numbers = TRUE
	)

	# Remove stopwords and stem words
	gotDfm <- dfm(tkn, tolower = TRUE, remove = stopwords("en"), stem = TRUE)

	# Remove irrelevant terms incl. single-character words
	# badWords are given in stemmed form because the features were stemmed
	badWords <- c(
	  "game", "throne", "gameofthron", "got",
	  "watch", "episod", "season", "show",
	  "just", "like"
	)
	keepTerm <- nchar(colnames(gotDfm)) > 1 & !colnames(gotDfm) %in% badWords
	gotDfm <- gotDfm[, keepTerm]

	# Air time of the six episodes (9pm US Eastern, weekly from 14th April).
	# "America/New_York" rather than the fixed UTC-5 zone "EST", so the times
	# line up with created_at, which is in America/New_York (EDT in April-May)
	epAirTime <- ymd_hms("2019-04-14 21:00:00", tz = "America/New_York") +
	  dweeks(0:5)

	# One dfm subset per episode: tweets posted from 2 hours to 4 days after
	# the episode aired. The rows of gotDfm correspond to the rows of
	# reducedTweet (the original referenced the undefined `tweetReduced`)
	wcLists <- lapply(seq_along(epAirTime), function(x){
	  idx <- reducedTweet$created_at > epAirTime[x] + dhours(2) &
	    reducedTweet$created_at < epAirTime[x] + ddays(4)
	  gotDfm[idx, ]
	})

	# No margins; same seed before each cloud for a reproducible layout
	par(mar = rep(0, 4))
	for(i in seq_along(wcLists)){
	  set.seed(100)
	  textplot_wordcloud(wcLists[[i]],
	    max_words = 100)
	}

	# Sentiment analysis
	# Replace tokens by the categories of the LSD2015 sentiment dictionary
	tknDct <- tokens_lookup(tkn, dictionary = data_dictionary_LSD2015)
	saDfm <- dfm(tknDct,
	  remove = stopwords("en"),
	  stem = TRUE)

	# Daily column totals. [, -1] drops the first column of the converted
	# data frame; grouping uses the dates of reducedTweet, the table tkn was
	# built from (the original referenced the undefined `tweetReduced`)
	summ <- do.call("rbind", by(convert(saDfm, to = "data.frame")[, -1],
	  INDICES = date(reducedTweet$created_at),
	  FUN = colSums))

	# reset past graphical pars; guarded because dev.off() raises an error
	# when no graphics device is open
	if (dev.cur() > 1) {
	  dev.off()
	}
	# Score = (col 2 - col 1) / (col 1 + col 2)
	# NOTE(review): assumes column 1 holds negative and column 2 positive
	# counts - confirm against colnames(summ)
	plot(date(rownames(summ)),
	  (summ[, 2] - summ[, 1]) / rowSums(summ[, 1:2]),
	  type = "l", xlab = "Date", ylab = "Sentiment score")
	abline(h = 0)
	abline(v = date(epAirTime), lty = 2, col = rgb(1, 0, 0, .5))
	text(date(epAirTime) - 3, .095, labels = paste0("Ep.", seq_along(epAirTime)))

Introduction

Kaggle

The cron scheduler

Sharing is caring

Let’s get started with R

Harvest

Twitter API

Google Maps API (optional)

Tweet search

Cron job setup

Processing

Get-Started analysis

Since you are here

Dealing with Twitter bots

Wordclouds

Sentiment analysis

Wrap-up

Citation

Share this:

8 thoughts on “Twitter data analysis in R”

Leave a comment Cancel reply