library(ggplot2)
library(plyr)
# Load the complete dataset from the bzip2-compressed CSV. Text columns are
# kept as character vectors; TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
bnames <- read.csv(
  bzfile("baby-names-by-state.csv.bz2"),
  stringsAsFactors = FALSE
)
# Drop rows that have no recorded count.
bnames <- subset(bnames, !is.na(number))
# Treat state as a categorical variable.
bnames$state <- factor(bnames$state)
# Keep only name/sex combinations that occur in more than 10% of the possible
# year/state combinations. 2450 is the hard-coded number of possible
# combinations (presumably years x states -- TODO confirm against the data).
bnames$namesex <- paste(bnames$name, bnames$sex, sep = "-")
# One row per name/sex key; ddply stores the row count in column V1.
counts <- ddply(bnames, "namesex", nrow)
# Most frequent keys first.
counts <- counts[order(-counts$V1), ]
counts <- counts[counts$V1 > 2450 * 0.10, ]
top <- bnames[bnames$namesex %in% counts$namesex, ]
# Look for names whose pattern over time varies a lot between states: fit,
# separately for every name/sex key, a linear model of log(number) with
# additive year (as a factor) and state effects. The result is a list of lm
# fits, one per key; a text progress bar is shown while fitting.
patterns <- dlply(
  .data = top,
  .variables = "namesex",
  .fun = function(df) lm(log(number) ~ factor(year) + state, data = df),
  .progress = "text"
)
# Extract R-squared from a fitted model.
#
# mod: a fitted model whose summary() exposes an `r.squared` component
#      (e.g. an lm fit).
# Returns a length-one numeric vector whose single element is named "rsq".
rsq <- function(mod) {
  fit_summary <- summary(mod)
  c(rsq = fit_summary$r.squared)
}
# Collect the R-squared of every per-name model into a data frame and sort it
# from best to worst fit.
qual <- ldply(patterns, rsq)
qual <- arrange(qual, -rsq)
# Keep the rows of the names whose model has R-squared below 0.93.
worst <- subset(top, namesex %in% subset(qual, rsq < 0.93)$namesex)
# Within each name, remove the per-state level: regress log10(number) on state
# and back-transform the residuals, so adj is the count relative to that
# state's fitted level for the name.
worst <- ddply(
  .data = worst,
  .variables = "namesex",
  .fun = transform,
  adj = 10^resid(lm(log10(number) ~ state))
)
# Plot the state-adjusted values over time, one line per state and one panel
# per name/sex key. ggplot() + geom_line() is used instead of qplot(), which
# is deprecated in current ggplot2 releases.
ggplot(worst, aes(year, adj, group = state)) +
  geom_line() +
  facet_wrap(~ namesex)

# Same plot restricted to a single name/sex key for a closer look.
ggplot(
  subset(worst, namesex == "Kayla-girl"),
  aes(year, adj, group = state)
) +
  geom_line()