Processes the circulation files to produce the final datasets

# knitr provides read_chunk()
library(knitr)
# Work from the project source directory (hard-coded, machine-specific path)
setwd("/home/rburke/oboc/src/")
# Read the labelled code chunks from circulation.R so they can be referenced
# by name in this document
read_chunk("circulation.R")

Load the project constants

# Work from the project source directory so the relative path below resolves
setwd("/home/rburke/oboc/src/")
# Load the project constants. The code below reads RCR_DATA, BOOK_PATHS,
# BOOK_LAUNCH, BRANCHES_IGNORABLE and DATA_VERSION — presumably all defined
# in params.R; confirm.
source("params.R")

Load the libraries

library(plyr)

# --- LOAD_TRANS ----
# Loading function
# Reads the pipe-delimited transactions file for one book.
#   book: book code, matched against the Book column of the BOOK_PATHS
#         data frame (see params.R); the second column of the matching row
#         supplies the book's directory, which is appended to RCR_DATA
# Returns the file contents as a data frame with strings kept as character.
# Stops if BOOK_PATHS does not hold exactly one entry for book.
load_transactions <- function(book) {
  file <- BOOK_PATHS[BOOK_PATHS$Book == book, 2]
  # Without this check an unknown book silently yields a path with an empty
  # directory component and fails later with an unrelated file error.
  if (length(file) != 1) {
    stop(paste0("Unknown book: ", book))
  }
  path.in <- paste0(RCR_DATA, file, "transactions.txt")
  read.csv(path.in, sep = "|", stringsAsFactors = FALSE)
}

Defining functions for managing the data

Load the transaction data

Create a canonical data frame for circulation data. This is necessary because the transaction files do not use the same fields.

# Canonicalize transactions
# Keeps only the timestamp, branch and transaction-type columns and renames
# them to "Date", "Code" and "Type". The transaction files are not all saved
# with the same naming convention, so the source column names depend on the
# book.
#   book: book code; selects which naming convention the file uses
#   df:   raw transaction data frame as loaded from the file
# Returns a data frame with exactly the columns Date, Code, Type (in that
# order). Stops on an unknown book or when an expected column is missing.
canonicalize_transactions <- function(book, df) {
  # Source columns listed in the order Date, Code, Type
  if (book %in% c("AM", "GB", "TC")) {
    source.cols <- c("TIMESTAMP", "ENVBRANCH", "TRANSACTIONTYPE")
  } else if (book %in% c("BT", "WS", "KC")) {
    # These books have slightly different naming conventions
    source.cols <- c("TIME_STAMP", "BRANCH", "TYPE")
  } else {
    stop(paste0("Unknown book: ", book))
  }

  absent <- setdiff(source.cols, names(df))
  if (length(absent) > 0) {
    stop(paste0("Transactions for book ", book, " are missing column(s): ",
                paste(absent, collapse = ", ")))
  }

  # Base R renaming: does not depend on which package's rename() happens to
  # be first on the search path.
  df.canon <- df[, source.cols, drop = FALSE]
  names(df.canon) <- c("Date", "Code", "Type")
  df.canon
}

Filter out the transactions other than checkouts and holds.

# Transaction filter
# Keeps only the transactions whose Type is among the allowed types; all
# other rows are removed.
#   df:    data frame with a Type column
#   types: vector of transaction type codes to keep
# Returns the filtered data frame.
transaction_filter <- function (df, types) {
  # drop = FALSE keeps the result a data frame even when df has one column
  df[df$Type %in% types, , drop = FALSE]
}

Filter the branches using the BRANCHES_IGNORABLE data

# Branch filter
# Two steps:
#   1. Recode missing branch codes as "NoA" (the North Austin problem —
#      presumably the branch code "NA" was parsed as a missing value; confirm).
#   2. Drop every row whose branch code appears in BRANCHES_IGNORABLE.
#   df: data frame with a Code column
# Returns the filtered data frame.
branch_filter <- function (df) {
  code.missing <- is.na(df$Code)
  if (any(code.missing)) {
    df$Code[code.missing] <- "NoA"
  }

  ignorable <- df$Code %in% BRANCHES_IGNORABLE
  df[!ignorable,]
}

Converts the time fields to Date objects. Creates the date offset column using the difference between the transaction time and the book release date. Filters the transactions to include only those whose offset (in days) lies between the min and max values.

# Time filter
# Adds a DateOffset column (days between each transaction and the book's
# release date) and keeps only the rows whose offset lies in [min, max].
#   book: book code; used to look up the release date in BOOK_LAUNCH (second
#         column) and to choose the date format of the Date column
#   df:   transaction data frame with a Date column
#   min:  smallest offset, in days, to keep (inclusive)
#   max:  largest offset, in days, to keep (inclusive)
# Returns df with the extra DateOffset column (a difftime in days),
# restricted to the requested window. Rows whose Date cannot be parsed are
# dropped. Stops if BOOK_LAUNCH does not hold exactly one date for book.
time_filter <- function (book, df, min, max) {
  # Look up the date of the book release
  date <- BOOK_LAUNCH[BOOK_LAUNCH$Book == book, 2]
  if (length(date) != 1) {
    stop(paste0("No unique launch date for book: ", book))
  }
  datePx <- as.POSIXct(as.Date(date))

  # The transaction files do not all write dates the same way
  if (book %in% c("AM", "GB", "KC", "TC")) {
    dateFormat <- "%Y/%m/%d"
  } else {
    dateFormat <- "%m/%d/%Y"
  }

  # Create a date offset column, which is date - release date
  df$DateOffset <- difftime(as.POSIXct(as.Date(df$Date, format = dateFormat)),
                            datePx,
                            units = "days")

  # Keep transactions inside [min, max]. An unparseable Date gives an NA
  # offset; indexing with NA would insert all-NA rows, so exclude those
  # explicitly.
  in.window <- !is.na(df$DateOffset) &
    df$DateOffset >= min &
    df$DateOffset <= max

  df[in.window, , drop = FALSE]
}

The main function

Loads and processes the transaction data.

# Builds a single data frame holding the filtered transactions of all books.
#   types: transaction type codes to keep (passed to transaction_filter)
#   min:   smallest DateOffset to keep (passed to time_filter)
#   max:   largest DateOffset to keep (passed to time_filter)
# Returns the combined data frame with Book, Code and Type as factors.
event_transactions <- function(types, min, max) {
  per.book <- list()

  # One entry per book listed in the first column of BOOK_PATHS
  for (book in BOOK_PATHS[,1]) {
    # load -> canonical columns -> branch filter -> type filter -> time window
    trans <- canonicalize_transactions(book, load_transactions(book))
    trans <- transaction_filter(branch_filter(trans), types)
    per.book[[book]] <- time_filter(book, trans, min, max)
  }

  # Stack the per-book data frames; the list names end up in the "Book" column
  combined <- ldply(per.book, .id="Book")

  # Convert the Book, Code, and Type data to factors
  for (column in c("Book", "Code", "Type")) {
    combined[[column]] <- factor(combined[[column]])
  }

  combined
}

Creating the total transactions per branch table

Get all the transactions.

# "CH" transactions with a date offset between 0 and 365 days
transdf <- event_transactions(types = "CH", min = 0, max = 365)

Make table and convert back to data frame

# Count the transactions for every Code x Book x Type combination, then
# flatten the contingency table back into a data frame
trans.tbl <- ftable(xtabs(formula = ~ Code + Book + Type, data = transdf))
trans.tbl.df <- as.data.frame(trans.tbl)

Save the data frame

# Output file: <RCR_DATA>circulation-total-<DATA_VERSION>.csv
path.out <- paste0(RCR_DATA, "circulation-total-", DATA_VERSION, ".csv")

# Write the totals table without a row-name column
write.csv(trans.tbl.df, file = path.out, row.names = FALSE)

Creating the daily transaction data

Get all the transactions. This time include the window from 180 days (about 6 months) before the release to 360 days (about one year) after it, again using only charged ("CH") transactions.

# "CH" transactions with a date offset between -180 and 360 days
# NOTE(review): the upper bound is 360 here but 365 for the totals table —
# confirm which is intended.
dailydf <- event_transactions(types = "CH", min = -180, max = 360)

Create a table with the daily transactions for each book / branch combination.

# Count the transactions for every DateOffset x Code x Book combination
# (unused factor levels dropped), then flatten the table into a data frame
daily.tbl <- ftable(xtabs(formula = ~ DateOffset + Code + Book,
                          data = dailydf,
                          drop.unused.levels = TRUE))
daily.tbl.df <- as.data.frame(daily.tbl)

Remove combinations with zero checkouts. Much smaller file.

# Keep only the combinations that actually occurred (Freq above zero)
daily.tbl.df <- daily.tbl.df[with(daily.tbl.df, Freq > 0),]

Save the file to circulation-daily-VERSION.csv.

# Output file: <RCR_DATA>circulation-daily-<DATA_VERSION>.csv
path.out <- paste0(RCR_DATA, "circulation-daily-", DATA_VERSION, ".csv")

# Write the aggregated daily counts (daily.tbl.df, zero-count combinations
# already removed), not the raw per-transaction rows in dailydf.
write.csv(daily.tbl.df, path.out, row.names = FALSE)