
The United States In Two Words

Sweet home Alabama, Where the skies are so blue; Sweet home Alabama, Lord, I’m coming home to you (Sweet home Alabama, Lynyrd Skynyrd)

This is the second post I have written to show the abilities of the twitteR package, and also the second one I have written for KDnuggets. This time my goal is to get an insight into what people tweet about American states. To do this, I search for tweets containing the exact phrase “[STATE NAME] is” for every state. Once I have the set of tweets for each state, I do some simple text mining: cleaning, standardizing, removing stop words and crossing the remaining terms with a couple of sentiment lexicons (lists of positive and negative words). Then I choose the two most common words to describe each state. You can read the original post here. This is the visualization I produced to show the result of the algorithm:

States In Two Words v2

Since the right side of the map is a little messy, the original post includes a table with the pair of words describing each state. This is just an experiment to show how to use and combine some interesting tools of R. If you don’t like what Twitter says about your state, don’t take it too seriously.

This is the code I wrote for this experiment:

# Do this if you have not registered your R app in Twitter
library(twitteR)
library(ROAuth) # OAuthFactory comes from the ROAuth package
library(RCurl)
setwd("YOUR-WORKING-DIRECTORY-HERE")
if (!file.exists('cacert.pem'))
{
  download.file(url = 'http://curl.haxx.se/ca/cacert.pem', destfile = 'cacert.pem')
}
requestURL="https://api.twitter.com/oauth/request_token"
accessURL="https://api.twitter.com/oauth/access_token"
authURL="https://api.twitter.com/oauth/authorize"
consumerKey = "YOUR-CONSUMER_KEY-HERE"
consumerSecret = "YOUR-CONSUMER-SECRET-HERE"
Cred <- OAuthFactory$new(consumerKey=consumerKey,
                         consumerSecret=consumerSecret,
                         requestURL=requestURL,
                         accessURL=accessURL,
                         authURL=authURL)
Cred$handshake(cainfo=system.file("CurlSSL", "cacert.pem", package="RCurl"))
save(Cred, file="twitter authentification.Rdata")
# Start here if you already have your twitter authentification.Rdata file
library(twitteR)
library(RCurl)
library(XML)
load("twitter authentification.Rdata")
registerTwitterOAuth(Cred)
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# Read state names from Wikipedia
webpage=getURL("http://simple.wikipedia.org/wiki/List_of_U.S._states")
table=readHTMLTable(webpage, which=1)
table=table[!(table$"State name" %in% c("Alaska", "Hawaii")), ]
#Extract tweets for each state
results=data.frame()
for (i in 1:nrow(table))
{
  tweets=searchTwitter(searchString=paste("'\"", table$"State name"[i], " is\"'",sep=""), n=200, lang="en")
  tweets.df=twListToDF(tweets)
  results=rbind(cbind(table$"State name"[i], tweets.df), results)
}
results=results[,c(1,2)]
colnames(results)=c("State", "Text")
library(tm)
#Lexicons
pos = scan('positive-words.txt',  what='character', comment.char=';')
neg = scan('negative-words.txt',  what='character', comment.char=';')
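# These two word lists seem to be the Hu & Liu opinion lexicon files (lines starting with ';' are comments);
# both files are assumed to be in the working directory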
posneg=c(pos,neg)
results$Text=tolower(results$Text)
results$Text=gsub("[[:punct:]]", " ", results$Text)
# Extract most important words for each state
words=data.frame(Abbreviation=character(0), State=character(0), word1=character(0), word2=character(0), word3=character(0), word4=character(0))
for (i in 1:nrow(table))
{
  doc=subset(results, State==as.character(table$"State name"[i]))
  doc.vec=VectorSource(doc[,2])
  doc.corpus=Corpus(doc.vec)
  stopwords=c(stopwords("english"), tolower(unlist(strsplit(as.character(table$"State name"), " "))), "like")
  doc.corpus=tm_map(doc.corpus, removeWords, stopwords)
  TDM=TermDocumentMatrix(doc.corpus)
  TDM=TDM[Reduce(intersect, list(rownames(TDM),posneg)),]
  v=sort(rowSums(as.matrix(TDM)), decreasing=TRUE)
  words=rbind(words, data.frame(Abbreviation=as.character(table$"Abbreviation"[i]), State=as.character(table$"State name"[i]),
                                   word1=attr(head(v, 4),"names")[1],
                                   word2=attr(head(v, 4),"names")[2],
                                   word3=attr(head(v, 4),"names")[3],
                                   word4=attr(head(v, 4),"names")[4]))
}
# Visualization
require("sqldf")
statecoords=as.data.frame(cbind(x=state.center$x, y=state.center$y, abb=state.abb))
# To make the names on the right side readable
texts=sqldf("SELECT a.abb,
            CASE WHEN a.abb IN ('DE', 'NJ', 'RI', 'NH') THEN a.x+1.7
            WHEN a.abb IN ('CT', 'MA') THEN a.x-0.5  ELSE a.x END as x,
            CASE WHEN a.abb IN ('CT', 'VA', 'NY') THEN a.y-0.4 ELSE a.y END as y,
            b.word1, b.word2 FROM statecoords a INNER JOIN words b ON a.abb=b.Abbreviation")
texts$col=rgb(sample(0:150, nrow(texts)), sample(0:150, nrow(texts)), sample(0:150, nrow(texts)), maxColorValue=255)
library(maps)
jpeg(filename = "States In Two Words v2.jpeg", width = 1200, height = 600, quality = 100)
map("state", interior = FALSE, col="gray40", fill=FALSE)
map("state", boundary = FALSE, col="gray", add = TRUE)
text(x=as.numeric(as.character(texts$x)), y=as.numeric(as.character(texts$y)), apply(texts[,4:5] , 1 , paste , collapse = "\n" ), cex=1, family="Humor Sans", col=texts$col)
dev.off()
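
As a side note, recent versions of the twitteR package include a setup_twitter_oauth() helper that replaces the whole handshake above. A minimal sketch, assuming you use one of those versions (all keys and tokens below are placeholders for your own app credentials):

library(twitteR)
# Direct authentication with a recent twitteR release
setup_twitter_oauth(consumer_key    = "YOUR-CONSUMER-KEY-HERE",
                    consumer_secret = "YOUR-CONSUMER-SECRET-HERE",
                    access_token    = "YOUR-ACCESS-TOKEN-HERE",
                    access_secret   = "YOUR-ACCESS-SECRET-HERE")
# After this call, searchTwitter() works exactly as in the script above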

Simple Data Science Of Global Warming In KDnuggets

Would love to get a post from you for KDnuggets (Gregory Piatetsky, KDnuggets President)

Some days ago, Gregory Piatetsky invited me to write a post for KDnuggets. I couldn’t say no. He suggested some topics to me and I decided to experiment around climate change to demonstrate how easy it is to see some evidence of this alarming threat. You can read the post here.

This is the code I wrote to do this experiment:

require(sqldf)
require(googleVis)
require(ggplot2)
require(ggthemes)
setwd("YOUR WORKING DIRECTORY HERE") #This line doen's work until you type a valid path
# Data available at http://eca.knmi.nl/utils/downloadfile.php?file=download/ECA_blend_tg.zip
files=list.files(pattern = "TG_STAID")
results=data.frame(staid=character(0), trnd=numeric(0), nobs=numeric(0))
#Loop to calculate linear trends
for (i in 1:length(files))
{
  table=read.table(files[i], header=TRUE, sep = ',', skip=20)
  table=table[table$Q_TG==0,]
  table$date=as.Date(as.character(table$DATE), "%Y%m%d")
  results=rbind(data.frame(staid=files[i], trnd=lm.fit (x = matrix(table$date), y = table$TG)$coefficients, nobs=nrow(table)), results)
}
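# Note that lm.fit() with a single-column x, as above, fits a regression with no intercept term;
# if you prefer the slope from a model with an intercept, a per-station alternative would be
# something like: coef(lm(TG ~ as.numeric(date), data = table))[2]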
write.csv(results, file="results.csv", row.names = FALSE) # Save your work
results=read.csv(file="results.csv") # Read your work. You can start from here if you have already run the lines above
#Remove outliers
results=results[!results$trnd %in% boxplot.stats(results$trnd, coef = 4)$out,]
#Histogram
hist(x=results$trnd, breaks = 50,
     col = "orange",
     border = "pink",
     freq=TRUE,
     xlim=c(-0.03, 0.03),
     ylim=c(0, 300),
     xlab="Historical trend of daily mean temperatures",
     ylab="Number of stations",
     main="Evolution of temperatures in Europe and the Mediterranean",
     sub="Source: European Climate Assessment & Dataset project")
results$staid2=as.numeric(gsub("[^0-9]","",results$staid)) #To join results with geographical coordinates
#Read table of geographical coordinates
staids=read.table("http://www.ecad.eu/download/ECA_all_stations.txt", header=TRUE, sep = ',', skip=17)
#Right tail of the distribution
hotpoints=sqldf("SELECT a.staid, a.staid2, a.trnd, a.nobs, b.staname, b.lat, b.lon
      FROM results a INNER JOIN staids b ON (a.staid2=b.staid) WHERE TRND >= 0.02")
#Convert degrees:minutes:seconds to decimal coordinates
hotpoints=within(hotpoints, {
  dms=do.call(rbind, strsplit(as.character(LAT), ":"))
  lat=sign(as.numeric(dms[,1]))*(abs(as.numeric(dms[,1]))+(as.numeric(dms[,2]) + as.numeric(dms[,3])/60)/60);rm(dms)
})
hotpoints=within(hotpoints, {
  dms=do.call(rbind, strsplit(as.character(LON), ":"))
  lon=sign(as.numeric(dms[,1]))*(abs(as.numeric(dms[,1]))+(as.numeric(dms[,2]) + as.numeric(dms[,3])/60)/60);rm(dms)
})
# To make the station names readable on the map
hotpoints$staname=gsub("^\\s+|\\s+$", "", hotpoints$STANAME)
# To prepare coordinates for the gvis function
hotpoints$LatLong=with(hotpoints, paste(lat, lon, sep=":"))
# The amazing gvisMap function of the googleVis package
hotpointsMap=gvisMap(hotpoints, "LatLong" , "STANAME",
        options=list(showTip=TRUE,
                     showLine=TRUE,
                     enableScrollWheel=TRUE,
                     mapType='terrain',
                     useMapTypeControl=TRUE))
plot(hotpointsMap)
# Plot of one of these hot stations
InAmenas=read.table("TG_STAID000312.txt", header=TRUE, sep = ',', skip=20)
InAmenas=InAmenas[InAmenas$Q_TG==0,]
InAmenas$date=as.Date(as.character(InAmenas$DATE), "%Y%m%d")
ggplot(InAmenas, aes(date, TG)) +
  geom_line(color="red", alpha=0.8) +
  xlab("") +
  ylab("Mean temperature in 0.1C")+
  ggtitle("Mean temperature in IN-AMENAS (ALGERIA) 1958- 1998")+
  geom_smooth(method = "lm", se=FALSE, color="red", lwd=2)+
  theme_economist(base_size = 20, base_family = "sans", horizontal = TRUE,
                  dkpanel = FALSE, stata = FALSE)
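
If you want to keep a copy of the chart, ggplot2's ggsave() can be called right after the plot is drawn; the file name below is just an example:

ggsave("in_amenas_trend.png", width = 10, height = 6)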