Preliminaries: Load Libraries

library(tidycensus)
library(tidyverse)
library(sf)
library(tmap)

Census Data Extraction: Basics

Extracting a table of variable codes

# Look up the variable codes for the 2010 decennial census ("sf1" dataset)
# and keep the lookup table in "decennial_variables"
decennial_variables <- load_variables(year = 2010, dataset = "sf1")
# Display the lookup table
decennial_variables
## # A tibble: 8,959 × 3
##    name    label                                concept         
##    <chr>   <chr>                                <chr>           
##  1 H001001 Total                                HOUSING UNITS   
##  2 H002001 Total                                URBAN AND RURAL 
##  3 H002002 Total!!Urban                         URBAN AND RURAL 
##  4 H002003 Total!!Urban!!Inside urbanized areas URBAN AND RURAL 
##  5 H002004 Total!!Urban!!Inside urban clusters  URBAN AND RURAL 
##  6 H002005 Total!!Rural                         URBAN AND RURAL 
##  7 H002006 Total!!Not defined for this file     URBAN AND RURAL 
##  8 H003001 Total                                OCCUPANCY STATUS
##  9 H003002 Total!!Occupied                      OCCUPANCY STATUS
## 10 H003003 Total!!Vacant                        OCCUPANCY STATUS
## # … with 8,949 more rows

Extracting data with get_decennial

Example 1

# Request variable P001001 for every state from the 2010 decennial census,
# asking for the state geometries to be returned along with the values
state_population_2010 <- get_decennial(
  geography = "state",
  variables = "P001001",
  year = 2010,
  geometry = TRUE
)
# Display "state_population_2010"
state_population_2010
## Simple feature collection with 52 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -179.1473 ymin: 17.88481 xmax: 179.7785 ymax: 71.35256
## geographic CRS: NAD83
## # A tibble: 52 × 5
##    GEOID NAME           variable    value                               geometry
##    <chr> <chr>          <chr>       <dbl>                     <MULTIPOLYGON [°]>
##  1 23    Maine          P001001   1328361 (((-67.61976 44.51975, -67.61541 44.5…
##  2 25    Massachusetts  P001001   6547629 (((-70.83204 41.6065, -70.82373 41.59…
##  3 26    Michigan       P001001   9883640 (((-88.68443 48.11578, -88.67563 48.1…
##  4 30    Montana        P001001    989415 (((-104.0577 44.99743, -104.2501 44.9…
##  5 32    Nevada         P001001   2700551 (((-114.0506 37.0004, -114.05 36.9577…
##  6 34    New Jersey     P001001   8791894 (((-75.52684 39.65571, -75.52634 39.6…
##  7 36    New York       P001001  19378102 (((-71.94356 41.28667, -71.9268 41.29…
##  8 37    North Carolina P001001   9535483 (((-82.60288 36.03983, -82.60074 36.0…
##  9 39    Ohio           P001001  11536504 (((-82.81349 41.72347, -82.81049 41.7…
## 10 42    Pennsylvania   P001001  12702379 (((-75.41504 39.80179, -75.42804 39.8…
## # … with 42 more rows

Example 2

# Request variable P001001 for each Colorado county from the 2010 decennial
# census (no geometry requested, so the result prints as a plain tibble)
CO_county_population_2010 <- get_decennial(
  geography = "county",
  state = "CO",
  variables = "P001001",
  year = 2010
)
# Display "CO_county_population_2010"
CO_county_population_2010
## # A tibble: 64 × 4
##    GEOID NAME                      variable  value
##    <chr> <chr>                     <chr>     <dbl>
##  1 08023 Costilla County, Colorado P001001    3524
##  2 08025 Crowley County, Colorado  P001001    5823
##  3 08027 Custer County, Colorado   P001001    4255
##  4 08029 Delta County, Colorado    P001001   30952
##  5 08031 Denver County, Colorado   P001001  600158
##  6 08035 Douglas County, Colorado  P001001  285465
##  7 08033 Dolores County, Colorado  P001001    2064
##  8 08049 Grand County, Colorado    P001001   14843
##  9 08039 Elbert County, Colorado   P001001   23086
## 10 08041 El Paso County, Colorado  P001001  622263
## # … with 54 more rows

Cleaning Extracted Data

# Tidy the Colorado county table with dplyr (part of the tidyverse):
# drop the "variable" column and rename "value" to "population"
CO_county_population_2010 <- CO_county_population_2010 %>%
  select(-variable) %>%
  rename(population = value)
# Display the cleaned "CO_county_population_2010"
CO_county_population_2010
## # A tibble: 64 × 3
##    GEOID NAME                      population
##    <chr> <chr>                          <dbl>
##  1 08023 Costilla County, Colorado       3524
##  2 08025 Crowley County, Colorado        5823
##  3 08027 Custer County, Colorado         4255
##  4 08029 Delta County, Colorado         30952
##  5 08031 Denver County, Colorado       600158
##  6 08035 Douglas County, Colorado      285465
##  7 08033 Dolores County, Colorado        2064
##  8 08049 Grand County, Colorado         14843
##  9 08039 Elbert County, Colorado        23086
## 10 08041 El Paso County, Colorado      622263
## # … with 54 more rows

Extracting Multiple Census Variables

# Request two variables per state from the 2010 decennial census in wide
# format (one column per variable), rename P001001 to "total_population" and
# P002005 to "rural_population", then sort from largest to smallest rural
# population
state_pop_ruralpop_2010 <- get_decennial(
  geography = "state",
  variables = c("P001001", "P002005"),
  year = 2010,
  output = "wide"
) %>%
  rename(total_population = P001001, rural_population = P002005) %>%
  arrange(desc(rural_population))
## Getting data from the 2010 decennial Census
## Using Census Summary File 1
# Display "state_pop_ruralpop_2010"
state_pop_ruralpop_2010
## # A tibble: 52 × 4
##    GEOID NAME           total_population rural_population
##    <chr> <chr>                     <dbl>            <dbl>
##  1 48    Texas                  25145561          3847522
##  2 37    North Carolina          9535483          3233727
##  3 42    Pennsylvania           12702379          2711092
##  4 39    Ohio                   11536504          2546810
##  5 26    Michigan                9883640          2513683
##  6 13    Georgia                 9687653          2415502
##  7 36    New York               19378102          2349997
##  8 47    Tennessee               6346105          2132860
##  9 51    Virginia                8001024          1963930
## 10 01    Alabama                 4779736          1957932
## # … with 42 more rows

Creating New Variables

# Use dplyr to derive a new column from existing ones: "rural_pct" is the
# rural population expressed as a percentage of the total population.
# The table is then re-sorted in descending order of "rural_pct".
state_pop_ruralpop_2010 <- state_pop_ruralpop_2010 %>%
  mutate(rural_pct = (rural_population / total_population) * 100) %>%
  arrange(desc(rural_pct))
# Display "state_pop_ruralpop_2010"
state_pop_ruralpop_2010
## # A tibble: 52 × 5
##    GEOID NAME          total_population rural_population rural_pct
##    <chr> <chr>                    <dbl>            <dbl>     <dbl>
##  1 23    Maine                  1328361           814819      61.3
##  2 50    Vermont                 625741           382356      61.1
##  3 54    West Virginia          1852994           950184      51.3
##  4 28    Mississippi            2967297          1503073      50.7
##  5 30    Montana                 989415           436401      44.1
##  6 05    Arkansas               2915918          1278329      43.8
##  7 46    South Dakota            814180           352933      43.3
##  8 21    Kentucky               4339367          1806024      41.6
##  9 01    Alabama                4779736          1957932      41.0
## 10 38    North Dakota            672591           269719      40.1
## # … with 42 more rows

Filtering census datasets

# Keep only the rows of "state_pop_ruralpop_2010" whose rural_pct is above 40
# and store them in a new object, "rural_pct_over40"
rural_pct_over40 <- state_pop_ruralpop_2010 %>%
  filter(rural_pct > 40)
# Display "rural_pct_over40"
rural_pct_over40
## # A tibble: 10 × 5
##    GEOID NAME          total_population rural_population rural_pct
##    <chr> <chr>                    <dbl>            <dbl>     <dbl>
##  1 23    Maine                  1328361           814819      61.3
##  2 50    Vermont                 625741           382356      61.1
##  3 54    West Virginia          1852994           950184      51.3
##  4 28    Mississippi            2967297          1503073      50.7
##  5 30    Montana                 989415           436401      44.1
##  6 05    Arkansas               2915918          1278329      43.8
##  7 46    South Dakota            814180           352933      43.3
##  8 21    Kentucky               4339367          1806024      41.6
##  9 01    Alabama                4779736          1957932      41.0
## 10 38    North Dakota            672591           269719      40.1

Student Exercise 1: Create a dataset of Colorado counties whose rural population percentage (with respect to the overall county population) exceeds 50% (based on the 2010 decennial census). Sort the dataset in descending order with respect to the rural percentage variable.

Census Data Visualization

Using ggplot to visualize census data

Make a graph that visually conveys the median age in Colorado, by county, based on the 2010 census.

# Request variable P013001 with geometry for each Colorado county (2010
# decennial census), rename "value" to "median_age", and replace the NAME
# column with a shorter "County" column that omits " County, Colorado"
median_age_CO <- get_decennial(
  geography = "county",
  state = "CO",
  variables = "P013001",
  year = 2010,
  geometry = TRUE
) %>%
  rename(median_age = value) %>%
  mutate(County = str_remove(NAME, " County, Colorado")) %>%
  select(-NAME)
# Display "median_age_CO"
median_age_CO
## Simple feature collection with 64 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -109.0603 ymin: 36.99243 xmax: -102.0415 ymax: 41.00344
## geographic CRS: NAD83
## # A tibble: 64 × 5
##    GEOID variable median_age                                     geometry County
##    <chr> <chr>         <dbl>                           <MULTIPOLYGON [°]> <chr> 
##  1 08053 P013001        47.1 (((-107.2116 37.42296, -107.2179 37.42298, … Hinsd…
##  2 08061 P013001        47.4 (((-102.1985 38.61522, -102.0688 38.61518, … Kiowa 
##  3 08063 P013001        39.7 (((-102.0498 39.57406, -102.0496 39.53893, … Kit C…
##  4 08071 P013001        43.9 (((-104.4804 36.99372, -104.5193 36.99377, … Las A…
##  5 08073 P013001        40.9 (((-103.7149 39.26751, -103.7149 39.2677, -… Linco…
##  6 08075 P013001        38.4 (((-102.9049 41.00221, -102.9048 41.00221, … Logan 
##  7 08079 P013001        53.1 (((-106.7108 37.40423, -106.7108 37.39624, … Miner…
##  8 08085 P013001        42   (((-109.0418 38.15302, -109.0418 38.16469, … Montr…
##  9 08087 P013001        36   (((-104.0706 40.5243, -104.0514 40.52432, -… Morgan
## 10 08089 P013001        40.9 (((-104.0597 37.85263, -104.0594 37.99616, … Otero 
## # … with 54 more rows
# Dot plot of median age by county: one point per county, with counties
# ordered along the y-axis by their median age
median_age_CO__visualization <-
  ggplot(
    median_age_CO,
    aes(x = median_age, y = reorder(County, median_age))
  ) +
  geom_point() +
  labs(
    title = "Median Age by County, CO",
    x = "Median Age",
    y = "County",
    caption = "Source: United States census accessed via tidycensus"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.caption = element_text(size = 5)
  )
# Display the plot
median_age_CO__visualization

Using tmap to visualize census data

Static Map

# Builds a choropleth map object of median age by Colorado county.
# The last break is Inf so that counties whose median age is above 50 are
# assigned to a top "50 or more" class. With a finite top break of 50 those
# counties fell outside the classification and every print of the map raised
# tmap's "higher than the highest break" warning.
# NOTE(review): tmap documents `midpoint` as a numeric value (or NA), not a
# logical; TRUE is kept here to avoid changing the colours, but confirm whether
# it was intended.
median_age_CO_map <-
  tm_shape(median_age_CO) +
  tm_polygons(
    col = "median_age",
    breaks = c(30, 35, 40, 45, 50, Inf),
    palette = "YlGnBu",
    midpoint = TRUE
  ) +
  tm_layout(
    frame = FALSE,
    main.title = "Median Age by County,\nColorado",
    main.title.position = "left",
    legend.outside = TRUE,
    attr.outside = TRUE
  ) +
  tm_credits("Source: US Census via tidycensus", position = c("right", "bottom"))
# prints "median_age_CO_map" (static, in the default plotting mode)
median_age_CO_map

Webmap

# changes tmap_mode to "view" (interactive web map)
tmap_mode("view")
## tmap mode set to interactive viewing
# prints "median_age_CO_map" in "view" mode
median_age_CO_map
## Credits not supported in view mode.
# changes mode back to "plot"
tmap_mode("plot")
## tmap mode set to plotting
# prints "median_age_CO_map" in "plot" mode
median_age_CO_map

Extracting American Community Survey data

# Look up the variable codes for the 5-year ACS ending in 2018 ("acs5")
# and keep the lookup table in "ACS_5_2018"
ACS_5_2018 <- load_variables(year = 2018, dataset = "acs5")
# Display "ACS_5_2018"
ACS_5_2018
## # A tibble: 26,997 × 3
##    name       label                                  concept                    
##    <chr>      <chr>                                  <chr>                      
##  1 B00001_001 Estimate!!Total                        UNWEIGHTED SAMPLE COUNT OF…
##  2 B00002_001 Estimate!!Total                        UNWEIGHTED SAMPLE HOUSING …
##  3 B01001_001 Estimate!!Total                        SEX BY AGE                 
##  4 B01001_002 Estimate!!Total!!Male                  SEX BY AGE                 
##  5 B01001_003 Estimate!!Total!!Male!!Under 5 years   SEX BY AGE                 
##  6 B01001_004 Estimate!!Total!!Male!!5 to 9 years    SEX BY AGE                 
##  7 B01001_005 Estimate!!Total!!Male!!10 to 14 years  SEX BY AGE                 
##  8 B01001_006 Estimate!!Total!!Male!!15 to 17 years  SEX BY AGE                 
##  9 B01001_007 Estimate!!Total!!Male!!18 and 19 years SEX BY AGE                 
## 10 B01001_008 Estimate!!Total!!Male!!20 years        SEX BY AGE                 
## # … with 26,987 more rows
# Request ACS variable B19013_001 for each Colorado county (year = 2018),
# rename "estimate" to "median_income", and sort from highest to lowest
median_income_CO_counties_2018 <- get_acs(
  geography = "county",
  state = "CO",
  variables = "B19013_001",
  year = 2018
) %>%
  rename(median_income = estimate) %>%
  arrange(desc(median_income))
## Getting data from the 2014-2018 5-year ACS
# Display "median_income_CO_counties_2018"
median_income_CO_counties_2018
## # A tibble: 64 × 5
##    GEOID NAME                        variable   median_income   moe
##    <chr> <chr>                       <chr>              <dbl> <dbl>
##  1 08035 Douglas County, Colorado    B19013_001        115314  2028
##  2 08039 Elbert County, Colorado     B19013_001         96658  4279
##  3 08014 Broomfield County, Colorado B19013_001         89624  4013
##  4 08037 Eagle County, Colorado      B19013_001         84685  4478
##  5 08059 Jefferson County, Colorado  B19013_001         78943  1142
##  6 08013 Boulder County, Colorado    B19013_001         78642  1583
##  7 08117 Summit County, Colorado     B19013_001         77589  4772
##  8 08047 Gilpin County, Colorado     B19013_001         75120  6107
##  9 08107 Routt County, Colorado      B19013_001         74273  3839
## 10 08005 Arapahoe County, Colorado   B19013_001         73925   902
## # … with 54 more rows

ACS Data Visualization

# Plots each county's 2018 median income estimate as a blue point, with
# horizontal bars spanning estimate - moe to estimate + moe; counties are
# ordered along the y-axis by median income.
# The " County, Colorado" suffix is removed together with its leading space
# (anchored to the end of NAME) so the county labels carry no trailing
# whitespace, matching how county names are shortened for the median-age data.
median_income_CO_counties_2018_viz <-
  median_income_CO_counties_2018 %>%
  mutate(County_Name = str_remove(NAME, " County, Colorado$")) %>%
  ggplot(aes(x = median_income, y = reorder(County_Name, median_income))) +
  geom_errorbarh(aes(xmin = median_income - moe, xmax = median_income + moe)) +
  geom_point(color = "blue", size = 3) +
  labs(
    title = "Median Income in Colorado, by County (2018)",
    y = "",
    x = "Median Income Estimate from 5 year ACS\n(Bars indicate margin of error)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# prints "median_income_CO_counties_2018_viz"
median_income_CO_counties_2018_viz