# Redis Stats in R
# May 12th, 2012
#
# aggregate stats from redis from our tweet research

# util: pretty-print bignums w/commas for readability
# x: value(s) to format; returns the character form produced by format(),
# with "," as the thousands separator and scientific notation disabled
pp <- function(x) {
  format(x, big.mark = ",", scientific = FALSE)
}

# open up access to redis data
# (redisConnect() is called with its defaults; no host/port is given here)
library(rredis)
redisConnect()

# core sets + zsets
sets <- c(
  "tweets:hashtags", "tweets:links", "tweets:mentions", "user:is_public"
)
zsets <- c("words", "user:followers", "user:num_tweets")

# 8 language sets + 121 country sets
langs <- c("DE", "EN", "ES", "FA", "FR", "NL", "PT", "RU")
countries <- c(
  "AE", "AF", "AG", "AM", "AO", "AQ", "AR", "AT", "AU", "AZ",
  "BA", "BB", "BD", "BE", "BH", "BN", "BR", "BS", "BW", "BY",
  "CA", "CH", "CL", "CN", "CO", "CR", "CU", "CY", "DE", "DK", "DO", "DZ",
  "EC", "EE", "EG", "ES", "ET", "FI", "FJ", "FK", "FR",
  "GB", "GE", "GH", "GI", "GL", "GR", "GT", "GU", "HK", "HN", "HR", "HU",
  "ID", "IE", "IL", "IN", "IR", "IT", "JM", "JO", "JP",
  "KE", "KH", "KP", "KR", "KW", "LB", "LK", "LT", "LU", "LV",
  "MA", "MC", "MK", "MT", "MU", "MW", "MX", "MY",
  "NG", "NI", "NL", "NO", "NP", "NZ", "OM",
  "PA", "PE", "PH", "PK", "PL", "PT", "PY", "QA",
  "RO", "RS", "RU", "RW", "SA", "SE", "SG", "SI", "SN", "SV",
  "TH", "TR", "TT", "TW", "TZ", "UA", "UG", "US", "UY",
  "VA", "VE", "VI", "VN", "XK", "ZA", "ZW"
)

# walk the list of key names and pretty-print stats for each set
# (seq_along() instead of 1:length(): an empty vector yields zero iterations)
for (i in seq_along(sets)) {
  print(paste(sets[i], ":", pp(redisSCard(sets[i]))))
}

# ...and zset
for (i in seq_along(zsets)) {
  print(paste(zsets[i], ":", pp(redisZCard(zsets[i]))))
}

# emit basic cardinality for all languages...
# one set per language code, keyed "user:lang:<code>"; every slot of the
# integer placeholder is overwritten by the loop below
lang_stats <- integer(length(langs))
for (i in seq_along(langs)) {
  key <- paste0("user:lang:", langs[i])
  card <- redisSCard(key)
  lang_stats[i] <- card
  print(paste(key, pp(card)))
}
# pair each code with its count, then give the columns readable names
lang_stats <- data.frame(langs, lang_stats)
names(lang_stats) <- c("tweet language", "occurrences")

# ...and countries
# same procedure, with keys of the form "user:country:<code>"
country_stats <- integer(length(countries))
for (i in seq_along(countries)) {
  key <- paste0("user:country:", countries[i])
  card <- redisSCard(key)
  country_stats[i] <- card
  print(paste(key, pp(card)))
}
country_stats <- data.frame(countries, country_stats)
names(country_stats) <- c("tweet country", "occurrences")

# clean up the workspace
# (drops only the loop temporaries; the two data.frames stay)
rm(i, card, key)

# after the run, stats accrue in 2 data.frames: lang_stats + country_stats

# "tweets:hashtags : 458,640
# "tweets:links : 270,319
# "tweets:mentions : 1,086,466
# "user:is_public : 1,812,923
#
# "words : 503,999
# "user:followers : 1,711,305
# "user:num_tweets : 1,207,538
#
# "user:lang:DE 9,369
# "user:lang:EN 1,622,940
# "user:lang:ES 62,800
# "user:lang:FA 932
# "user:lang:FR 166,233
# "user:lang:NL 3,361
# "user:lang:PT 5,109
# "user:lang:RU 124,741