Creates a principal components representation of the demographic data.

Load the code from the R file

library(knitr)
setwd("/home/rburke/oboc/src/rcr-analysis/src/demographic")
read_chunk("components.R")

Load project constants

# Load project constants
setwd("/home/rburke/oboc/src/rcr-analysis/src")
source("common.R")

Load libraries

# Load libraries
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)

Load demographic data

# Load demographic data
path <- paste(RCR_DATA, "branch/branch-data-reduced-", 
              DATA_VERSION, ".csv", sep="")
branch.demo <- read.csv(path)

Extract demographic features

Skip the library features, such as circulation and visitors

# Extract demographic features
features <- branch.demo[,12:95]
features.names <- colnames(features)
branch.remain <- branch.demo[,1:11] # Save for later output

Remove features with high correlations

# Remove features with high correlations
set.seed(1218)
features.cor <- cor(features)
highcorr <- findCorrelation(features.cor, cutoff=0.90, verbose=FALSE, 
                            names=FALSE, exact=TRUE)
features.filtered <- features[,-highcorr]

Remove columns with low incidence

Less than 1% of population

# Remove columns with low incidence
lowcols <- apply(features.filtered, 2, max)>=1
features.filtered <- features.filtered[,lowcols]

Compute principal components

# Compute principal components
set.seed(1218)
features.components <- prcomp(features.filtered, center = TRUE, scale.=TRUE)
summary(features.components)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     4.6225 3.4328 2.9895 1.79520 1.53971 1.24215
## Proportion of Variance 0.3561 0.1964 0.1489 0.05371 0.03951 0.02572
## Cumulative Proportion  0.3561 0.5525 0.7015 0.75520 0.79471 0.82043
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     1.17959 1.03670 0.95095 0.89596 0.82651 0.77127
## Proportion of Variance 0.02319 0.01791 0.01507 0.01338 0.01139 0.00991
## Cumulative Proportion  0.84362 0.86153 0.87660 0.88998 0.90137 0.91128
##                           PC13    PC14    PC15   PC16    PC17    PC18
## Standard deviation     0.74442 0.71808 0.67738 0.6340 0.60530 0.56771
## Proportion of Variance 0.00924 0.00859 0.00765 0.0067 0.00611 0.00537
## Cumulative Proportion  0.92052 0.92911 0.93676 0.9435 0.94956 0.95494
##                           PC19    PC20    PC21    PC22   PC23   PC24
## Standard deviation     0.51904 0.49461 0.48468 0.43542 0.4102 0.4023
## Proportion of Variance 0.00449 0.00408 0.00392 0.00316 0.0028 0.0027
## Cumulative Proportion  0.95943 0.96350 0.96742 0.97058 0.9734 0.9761
##                           PC25    PC26    PC27    PC28    PC29   PC30
## Standard deviation     0.38878 0.37203 0.34740 0.32388 0.31126 0.3098
## Proportion of Variance 0.00252 0.00231 0.00201 0.00175 0.00161 0.0016
## Cumulative Proportion  0.97860 0.98091 0.98292 0.98467 0.98628 0.9879
##                           PC31    PC32    PC33    PC34    PC35    PC36
## Standard deviation     0.28684 0.27350 0.26653 0.24808 0.24295 0.22600
## Proportion of Variance 0.00137 0.00125 0.00118 0.00103 0.00098 0.00085
## Cumulative Proportion  0.98925 0.99050 0.99168 0.99271 0.99369 0.99454
##                           PC37    PC38   PC39    PC40    PC41    PC42
## Standard deviation     0.21679 0.20981 0.1903 0.17102 0.15849 0.15082
## Proportion of Variance 0.00078 0.00073 0.0006 0.00049 0.00042 0.00038
## Cumulative Proportion  0.99533 0.99606 0.9967 0.99715 0.99757 0.99795
##                           PC43    PC44    PC45    PC46    PC47    PC48
## Standard deviation     0.14021 0.12988 0.12611 0.12022 0.11304 0.09772
## Proportion of Variance 0.00033 0.00028 0.00027 0.00024 0.00021 0.00016
## Cumulative Proportion  0.99828 0.99856 0.99882 0.99906 0.99928 0.99944
##                           PC49    PC50   PC51    PC52    PC53    PC54
## Standard deviation     0.09460 0.08927 0.0758 0.06265 0.05048 0.04332
## Proportion of Variance 0.00015 0.00013 0.0001 0.00007 0.00004 0.00003
## Cumulative Proportion  0.99958 0.99972 0.9998 0.99988 0.99992 0.99995
##                           PC55    PC56     PC57      PC58      PC59
## Standard deviation     0.03993 0.03528 0.006292 0.0003402 0.0002182
## Proportion of Variance 0.00003 0.00002 0.000000 0.0000000 0.0000000
## Cumulative Proportion  0.99998 1.00000 1.000000 1.0000000 1.0000000
##                             PC60
## Standard deviation     5.559e-05
## Proportion of Variance 0.000e+00
## Cumulative Proportion  1.000e+00

Component heat map

This function display a one-column heat map of one of the principal components showing the contribution of each demograpic feature. The threshold removes dimensions with a smaller contribution.

componentMap <- function(comp, thresh)
{
  overThresh <- which(abs(comp)>thresh)
  comp.filtered <- comp[overThresh]
  comp.filtered <- comp.filtered[order(comp.filtered, decreasing=FALSE)]
  
  df <- data.frame(x=c("Component"),
                   y=factor(names(comp.filtered), levels=names(comp.filtered)),
                   value=comp.filtered)

  p <- ggplot(df, aes(x=x, y=y, fill=value))
  p <- p + geom_tile()
  p <- p + labs(x="",y="")
  p <- p + scale_x_discrete(expand=c(0,0))
  p <- p + scale_fill_gradientn(name="",
                      colours=colorRampPalette(c('blue', 'white', 'red'))(12))
  print(p)
}

Principal component 1

componentMap(features.components$rotation[,1], 0.1)

Note the strong (positive) contribution for:

Note the strong (negative) contribution for:

This axis has to do with affluence.

componentMap(features.components$rotation[,2], 0.1)

Note the strong (positive) contribution for:

Note the strong (negative) contribution for:

This axis corresponds to the division between Hispanic and Black areas.

componentMap(features.components$rotation[,3], 0.1)