Processes the holdings files to produce the final dataset

# knitr provides read_chunk(), used below
library(knitr)
# Switch to the project source directory so the relative file name resolves
setwd("/home/rburke/oboc/src/")
# Read the code chunks defined in the external script holdings.R
read_chunk("holdings.R")

Load the project constants

# Load project constants
# Switch to the project source directory so the relative path to params.R
# resolves
setwd("/home/rburke/oboc/src/")
# params.R is sourced for the constants used further down; presumably these
# are BOOK_PATHS, RCR_DATA, BRANCHES_IGNORABLE and DATA_VERSION -- TODO confirm
source("params.R")
# Holdings

Load the libraries

library(plyr)

# --- LOAD_TRANS ----
# Loading function
# Reads the raw holdings file for a single book.
# book: Book identifier, matched against the Book column of the BOOK_PATHS
#       data frame (see params.R)
# Returns the contents of that book's holdings.txt (pipe-separated) as a data
# frame, with strings kept as character vectors.
# Stops with an error if the book does not match exactly one row of BOOK_PATHS.
load_holdings <- function(book) {
  # which() drops NA comparisons, so only genuine matches are counted
  idx <- which(BOOK_PATHS$Book == book)
  if (length(idx) == 0) {
    # Without this check an unknown book would yield an empty path fragment
    # and the function would try to read RCR_DATA + "holdings.txt" instead
    stop(paste0("Unknown book: ", book))
  }
  if (length(idx) > 1) {
    stop(paste0("Book listed more than once in BOOK_PATHS: ", book))
  }

  # The second column of BOOK_PATHS holds the book's path fragment; it is
  # concatenated between RCR_DATA and the file name with no separator added
  book.dir <- BOOK_PATHS[idx, 2]
  path.in <- paste0(RCR_DATA, book.dir, "holdings.txt")
  read.csv(path.in, sep = "|", stringsAsFactors = FALSE)
}

Defining functions for managing the data

Load the holdings data

Create a canonical data frame for the holdings data. This is necessary because the holdings files do not all use the same field names: the branch column is called `KBRA` in some files and `BRANCH` in others.

# Canonicalize holdings
# Keep only Branch info.
# Note that the files are not all saved with the same naming convention:
# books AM and TC store the branch in a KBRA column, while GB, BT, WS and KC
# store it in a BRANCH column.
# book: Book identifier (one of the codes listed above)
# df:   Raw holdings data frame for that book
# Returns a data frame with one row per row of df and two columns: Book (the
# identifier, repeated) and Code (the branch value). A df with zero rows
# yields a zero-row result.
# Stops with an error for an unrecognised book or a missing branch column.
canonicalize_holdings <- function(book, df) {
  if (book %in% c("AM", "TC")) {
    branch.col <- "KBRA"
  } else if (book %in% c("GB", "BT", "WS", "KC")) {
    branch.col <- "BRANCH"
  } else {
    stop(paste0("Unknown book: ", book))
  }

  if (!(branch.col %in% names(df))) {
    stop(paste0("Holdings for book ", book, " have no ", branch.col,
                " column"))
  }

  # Repeat the book explicitly: data.frame() refuses to recycle a length-one
  # value against a zero-length column, which made empty holdings files fail
  data.frame(Book = rep(book, nrow(df)), Code = df[[branch.col]],
             stringsAsFactors = FALSE)
}

Filter the branches using the BRANCHES_IGNORABLE data

# Branch filter
# Cleans up the branch codes of a canonical holdings data frame.
# df: Data frame with a Code column of branch codes
# Returns df with missing codes recoded to "NoA" and with every row whose
# code appears in BRANCHES_IGNORABLE removed.
branch_filter <- function (df) {
  # Missing codes are relabelled "NoA". This is the North Austin fix;
  # presumably that branch's code was read in as NA -- TODO confirm against
  # the raw files.
  code.missing <- is.na(df$Code)
  if (any(code.missing)) {
    df$Code[code.missing] <- "NoA"
  }

  # Keep only the rows whose code has no match among the ignorable branches
  unlisted <- is.na(match(df$Code, BRANCHES_IGNORABLE))
  df[unlisted, ]
}

The main function

Loads and processes the holdings data for every book.

# Processes all the holdings into a single data frame
# Takes no arguments; the books to process are read from the first column of
# BOOK_PATHS.
# Returns one data frame covering every book, with Book and Code as factors.
event_holdings <- function() {
  # Load, canonicalize and branch-filter the holdings of one book
  process_book <- function(book) {
    branch_filter(canonicalize_holdings(book, load_holdings(book)))
  }

  # One list entry per book, keyed by the book's name
  books <- unique(as.character(BOOK_PATHS[, 1]))
  hold.files <- lapply(books, process_book)
  names(hold.files) <- books

  # Assemble all the listed data frames into one.
  holdsdf <- ldply(hold.files, .id="Book")

  # Convert the Book and Code data to factors
  holdsdf$Book <- factor(holdsdf$Book)
  holdsdf$Code <- factor(holdsdf$Code)

  holdsdf
}

Creating the total holdings per branch table

Get all the holdings.

# Run the load / canonicalize / branch-filter steps for every book and keep
# the combined result
holdsdf <- event_holdings()

Make table and convert back to data frame

# Count the rows of holdsdf for every Book x Code combination
holds.tbl <- xtabs(~ Book + Code, data=holdsdf)
# Back to long form: one row per Book/Code pair with the count in Freq
holds.tbl.df <- as.data.frame(holds.tbl)
# Give the count column a descriptive name (rename() is presumably plyr's,
# loaded earlier in this file)
holds.tbl.df <- rename(holds.tbl.df, replace=c("Freq"="Holds"))

Save the data frame

# Build the versioned output path under RCR_DATA, then write the per-branch
# counts as CSV without a row-name column
path.out <- paste0(RCR_DATA, "holdings-", DATA_VERSION, ".csv")
write.csv(holds.tbl.df, file = path.out, row.names = FALSE)