5 min read

NBA Sparklines code

This code builds Edward Tufte-inspired sparkline graphs for the 2018 NBA season available. You can see the updated version of these graphs here.

# If you haven't installed these packages yet, do so now
library(tidyverse)
library(rvest)
library(ggthemes)
library(gridExtra)


# This file builds sparklines for each NBA team's record in the 2018 season

# Get Basketball Reference data
list.months <- tolower(month.name[c(1:4,10:12)]) #  Make a list of month names

# Loop through the monthly Basketball-Reference data
for(i in seq_along(list.months)){
  url <- paste0("https://www.basketball-reference.com/leagues/NBA_2018_games-",
                list.months[i],".html")
  d <- read_html(url) %>%
    html_table(header=T) %>%
    data.frame()
  d <- d[,c(1,3:6)] # Just keep the desired columns
  assign(paste(list.months[i]), d) #  rename the data.frame with its month name
  rm(d)
}
d <- rbind(october, november, december, january, february, march, april) #  combine all data.frames
rm(list=setdiff(ls(), "d")) # Remove everything except unified data.set, just of neatness sake

# Divisions
##  Rather than hand-entering, I grab the divisions data from Wikipedia
##    First, the western conference
west <- read_html("https://en.wikipedia.org/wiki/Western_Conference_(NBA)") %>%
  html_nodes(xpath = "/html/body/div[3]/div[3]/div[4]/div/table[4]") %>%
  html_table(header=T, fill=T) %>%
  data.frame()
west <- west[-1,] # There's an extra row at the top, delete it
west <- west[,c(1,2)] # Just keep the relevant columns

##    Repeat the above process for the eastern conference
east <- read_html("https://en.wikipedia.org/wiki/eastern_Conference_(NBA)") %>%
  html_nodes(xpath = "/html/body/div[3]/div[3]/div[4]/div/table[4]") %>%
  html_table(header=T, fill=T) %>%
  data.frame()
east <- east[-1,]
east <- east[,c(1,2)]
division <- rbind(west, east) # combine the eastern/western conferences

# The code below converts the Date into a variable R understands as a Date
d$Date <- str_sub(d$Date, 6) #  Remove the first 6 characters
d$Date <- str_replace(d$Date, ",", "") #  remove the comma
d$Date <- as.Date(d$Date, format = "%b %d %Y")

# Binary variable for if home team won or lost
d$visWIN[d$PTS < d$PTS.1] <- "loss"
d$visWIN[d$PTS > d$PTS.1] <- "win"

# Binary variable for if away team won or lost
d$homeWIN[d$PTS > d$PTS.1] <- "loss"
d$homeWIN[d$PTS < d$PTS.1] <- "win"

# Next we convert the data.frame to a format that has just 1 team and result per row
# The raw data has 1 game per row
home <- d[,c("Home.Neutral","Date","homeWIN")] #  Home teams
names(home) <- c("team","Date","result") #        Change names
visitor <- d[,c("Visitor.Neutral","Date","visWIN")] # Visitor teams
names(visitor) <- c("team","Date","result") #  change names

# Merge the two data.frames (they have the same names now)
d <- rbind(home, visitor)
d <- d[order(d$Date),] #  order the data.frame by Date

# Dummy variable for if a team won or not
d$windum <- 0
d$windum[d$result=="win"] <- 1
original <- d # Save a copy of the data.frame as "original"

# Calculate each team's record for every given gameday in the season
d <- original %>%
  group_by(team) %>%
  mutate(wins = cumsum(windum), #         "Cumulative sum" the "1" values for if the team won
         games = cumsum(!is.na(result))) #  CumSum every game
d$record.500 <- d$wins/d$games #            Calculate the % of games that were victories
d$record <- d$wins - (d$games - d$wins) #   Wins - losses

# This makes a text version of the numeric variable with a "+" sign in front of positive numbers
d$record.text <- d$record
d$record.text[d$record>0] <- paste0("+", d$record[d$record>0])
d$record[is.na(d$result)] <- NA # Make NA if date hasn't occured yet

# Join the record data to the division data
d <- left_join(d, division, by=c("team"="Team"))
d$team2 <- word(d$team, -1) # A shortened version of each team name

last.day <- last(d$Date[!is.na(d$result)]) #  the day of the most recent game

# For the graph labels, we want the data for just the most recent day
last.Date <- d[!is.na(d$record),] %>%
  group_by(team) %>%
  filter(row_number()==n()) # Get the last gameday (that has already been played)

# Make a plot for the Central division
# When values overlap in "geom_text" we must plot them separately
gg.central <- ggplot() +
  geom_line(data = d[d$Division=="Central",],
            aes(as.Date(Date), record, color=team), size=1) +
  geom_text(data = last.Date[last.Date$Division=="Central",],
            aes(as.Date(Date), record, label=paste(record.text, team2,
                                                   str_sub(round(record.500, digits = 3),
                                                           2)), color=team),
            hjust=0, nudge_x = 2) +
  ylim(min(d$record, na.rm = T), max(d$record, na.rm = T)) +
  scale_x_date(limits = c(as.Date("2017-10-17"),
                          as.Date("2018-05-10"))) +
  ggtitle("Central Division") +
  xlab(NULL) +
  theme_tufte() +
  theme(legend.position = "none")
print(gg.central)


gg.Pacific <- ggplot() +
  geom_line(data = d[d$Division=="Pacific",],
            aes(as.Date(Date), record, color=team), size=1) +
  geom_text(data = last.Date[last.Date$Division=="Pacific",],
            aes(as.Date(Date), record, label=paste(record.text, team2,
                                                   str_sub(round(record.500, digits = 3),
                                                           2)), color=team),
            hjust=0, nudge_x = 2) +
  ylim(min(d$record, na.rm = T), max(d$record, na.rm = T)) +
  scale_x_date(limits = c(as.Date("2017-10-17"),
                          as.Date("2018-05-10"))) +
  ggtitle("Pacific Division") +
  xlab(NULL) +
  theme_tufte() +
  theme(legend.position = "none")
print(gg.Pacific)

gg.Atlantic <- ggplot() +
  geom_line(data = d[d$Division=="Atlantic",],
            aes(as.Date(Date), record, color=team), size=1) +
  geom_text(data = last.Date[last.Date$Division=="Atlantic",],
            aes(as.Date(Date), record, label=paste(record.text, team2,
                                                   str_sub(round(record.500, digits = 3),
                                                           2)), color=team),
            hjust=0, nudge_x = 2) +
  ylim(min(d$record, na.rm = T), max(d$record, na.rm = T)) +
  scale_x_date(limits = c(as.Date("2017-10-17"),
                          as.Date("2018-05-10"))) +
  ggtitle("Atlantic Division") +
  xlab(NULL) +
  theme_tufte() +
  theme(legend.position = "none")
print(gg.Atlantic)


gg.Southeast <- ggplot() +
  geom_line(data = d[d$Division=="Southeast",],
            aes(as.Date(Date), record, color=team), size=1) +
  geom_text(data = last.Date[last.Date$Division=="Southeast",],
            aes(as.Date(Date), record, label=paste(record.text, team2,
                                                   str_sub(round(record.500, digits = 3),
                                                           2)), color=team),
            hjust=0, nudge_x = 2) +
  ylim(min(d$record, na.rm = T), max(d$record, na.rm = T)) +
  scale_x_date(limits = c(as.Date("2017-10-17"),
                          as.Date("2018-05-10"))) +
  ggtitle("Southeast Division") +
  xlab(NULL) +
  theme_tufte() +
  theme(legend.position = "none")
print(gg.Southeast)



gg.Southwest <- ggplot() +
  geom_line(data = d[d$Division=="Southwest",],
            aes(as.Date(Date), record, color=team), size=1) +
  geom_text(data = last.Date[last.Date$Division=="Southwest",],
            aes(as.Date(Date), record, label=paste(record.text, team2,
                                                   str_sub(round(record.500, digits = 3),
                                                           2)), color=team),
            hjust=0, nudge_x = 2) +
  ylim(min(d$record, na.rm = T), max(d$record, na.rm = T)) +
  scale_x_date(limits = c(as.Date("2017-10-17"),
                          as.Date("2018-05-10"))) +
  ggtitle("Southwest Division") +
  xlab(NULL) +
  theme_tufte() +
  theme(legend.position = "none")
print(gg.Southwest)


gg.Northwest <- ggplot() +
  geom_line(data = d[d$Division=="Northwest",],
            aes(as.Date(Date), record, color=team), size=1) +
  geom_text(data = last.Date[last.Date$Division=="Northwest",],
            aes(as.Date(Date), record, label=paste(record.text, team2,
                                                   str_sub(round(record.500, digits = 3),
                                                           2)), color=team),
            hjust=0, nudge_x = 2) +
  ylim(min(d$record, na.rm = T), max(d$record, na.rm = T)) +
  scale_x_date(limits = c(as.Date("2017-10-17"),
                                as.Date("2018-05-10"))) +
  ggtitle("Northwest Division") +
  xlab(NULL) +
  theme_tufte() +
  theme(legend.position = "none")
print(gg.Northwest)