Learning Objectives

By the end of this practical lab you will be able to:

Basic numeric summary statistics

In the first part of this practical we will import some Airbnb data for Amsterdam that was collected during July, 2016.

# Read in CSV
amsterdam <- read.csv("./data/listings.csv")

This contains a wide array of variables:

colnames(amsterdam)
##  [1] "id"                               "listing_url"                     
##  [3] "scrape_id"                        "last_scraped"                    
##  [5] "name"                             "summary"                         
##  [7] "space"                            "description"                     
##  [9] "experiences_offered"              "neighborhood_overview"           
## [11] "notes"                            "transit"                         
## [13] "access"                           "interaction"                     
## [15] "house_rules"                      "thumbnail_url"                   
## [17] "medium_url"                       "picture_url"                     
## [19] "xl_picture_url"                   "host_id"                         
## [21] "host_url"                         "host_name"                       
## [23] "host_since"                       "host_location"                   
## [25] "host_about"                       "host_response_time"              
## [27] "host_response_rate"               "host_acceptance_rate"            
## [29] "host_is_superhost"                "host_thumbnail_url"              
## [31] "host_picture_url"                 "host_neighbourhood"              
## [33] "host_listings_count"              "host_total_listings_count"       
## [35] "host_verifications"               "host_has_profile_pic"            
## [37] "host_identity_verified"           "street"                          
## [39] "neighbourhood"                    "neighbourhood_cleansed"          
## [41] "neighbourhood_group_cleansed"     "city"                            
## [43] "state"                            "zipcode"                         
## [45] "market"                           "smart_location"                  
## [47] "country_code"                     "country"                         
## [49] "latitude"                         "longitude"                       
## [51] "is_location_exact"                "property_type"                   
## [53] "room_type"                        "accommodates"                    
## [55] "bathrooms"                        "bedrooms"                        
## [57] "beds"                             "bed_type"                        
## [59] "amenities"                        "square_feet"                     
## [61] "price"                            "weekly_price"                    
## [63] "monthly_price"                    "security_deposit"                
## [65] "cleaning_fee"                     "guests_included"                 
## [67] "extra_people"                     "minimum_nights"                  
## [69] "maximum_nights"                   "calendar_updated"                
## [71] "has_availability"                 "availability_30"                 
## [73] "availability_60"                  "availability_90"                 
## [75] "availability_365"                 "calendar_last_scraped"           
## [77] "number_of_reviews"                "first_review"                    
## [79] "last_review"                      "review_scores_rating"            
## [81] "review_scores_accuracy"           "review_scores_cleanliness"       
## [83] "review_scores_checkin"            "review_scores_communication"     
## [85] "review_scores_location"           "review_scores_value"             
## [87] "requires_license"                 "license"                         
## [89] "jurisdiction_names"               "instant_bookable"                
## [91] "cancellation_policy"              "require_guest_profile_picture"   
## [93] "require_guest_phone_verification" "calculated_host_listings_count"  
## [95] "reviews_per_month"

However, for this practical we will subset these data to a limited set of variables:

# Subset data
amsterdam <- subset(amsterdam, select = c("id","neighbourhood_cleansed","latitude","longitude","property_type","room_type","bedrooms","price","number_of_reviews"))

# Clean the price data to remove $ and , then convert to numeric
amsterdam$price <- gsub("$","",amsterdam$price,fixed = TRUE)
amsterdam$price <- gsub(",","",amsterdam$price,fixed = TRUE)
amsterdam$price <- as.numeric(as.character(amsterdam$price))

#Remove any records that are not complete
amsterdam <- amsterdam[complete.cases(amsterdam),]

# Show the top six rows
head(amsterdam)
##         id neighbourhood_cleansed latitude longitude property_type
## 1  9784567           Bijlmer-Oost 52.32337  4.977730     Apartment
## 2 13163805           Bijlmer-Oost 52.31564  4.978166     Apartment
## 3  3719167           Bijlmer-Oost 52.31671  4.986618         House
## 4 13188367           Bijlmer-Oost 52.32120  4.976630     Apartment
## 5  4831606           Bijlmer-Oost 52.32727  4.967352         House
## 6 13727930           Bijlmer-Oost 52.31459  4.975385     Apartment
##         room_type bedrooms price number_of_reviews
## 1    Private room        1    95                 5
## 2    Private room        1    60                 4
## 3    Private room        1    45                11
## 4    Private room        1    35                 2
## 5    Private room        1    39                12
## 6 Entire home/apt        0    80                 0

The simplest way to create a set of descriptive statistics is to use the summary() function which returns the minimum, maximum, first and third quartile, and the mean and median for each column. For non numeric columns (e.g. neighbourhood_cleansed),the frequencies of the top seven most frequent attributes are shown.

#Summary
summary(amsterdam)
##        id                       neighbourhood_cleansed    latitude    
##  Min.   :    2818   De Baarsjes - Oud-West :2430       Min.   :52.29  
##  1st Qu.: 3736210   Centrum-West           :1878       1st Qu.:52.36  
##  Median : 7555967   De Pijp - Rivierenbuurt:1619       Median :52.37  
##  Mean   : 7418371   Centrum-Oost           :1345       Mean   :52.37  
##  3rd Qu.:11267148   Westerpark             :1029       3rd Qu.:52.37  
##  Max.   :13828444   Zuid                   :1008       Max.   :52.43  
##                     (Other)                :4528                      
##    longitude             property_type             room_type    
##  Min.   :4.763   Apartment      :10954   Entire home/apt:11157  
##  1st Qu.:4.866   House          : 1459   Private room   : 2609  
##  Median :4.886   Boat           :  433   Shared room    :   71  
##  Mean   :4.889   Bed & Breakfast:  392                          
##  3rd Qu.:4.907   Condominium    :  207                          
##  Max.   :5.028   Townhouse      :  155                          
##                  (Other)        :  237                          
##     bedrooms          price        number_of_reviews
##  Min.   : 0.000   Min.   :  10.0   Min.   :  0.00   
##  1st Qu.: 1.000   1st Qu.:  89.0   1st Qu.:  1.00   
##  Median : 1.000   Median : 115.0   Median :  6.00   
##  Mean   : 1.428   Mean   : 132.8   Mean   : 15.42   
##  3rd Qu.: 2.000   3rd Qu.: 150.0   3rd Qu.: 16.00   
##  Max.   :10.000   Max.   :8616.0   Max.   :416.00   
## 

There are numerous descriptive statistics available individually within R - here we illustrate these by exploring the “price” attribute:

#Mean
mean(amsterdam$price)
## [1] 132.8304
# Median
median(amsterdam$price)
## [1] 115
# Standard deviation
sd(amsterdam$price)
## [1] 105.3017
# Min, Max, Range
min(amsterdam$price)
## [1] 10
max(amsterdam$price)
## [1] 8616
range(amsterdam$price)
## [1]   10 8616
# Quantiles
quantile(amsterdam$price)
##   0%  25%  50%  75% 100% 
##   10   89  115  150 8616

Frequency tables

Another common basic analysis task is to create a frequency table for a categorical variable - for example, the number of listings per neighborhood. This can be achieved using the table() function which prints a list of the unique attributes and the frequency of these observations.

table(amsterdam$neighbourhood_cleansed)
## 
##                        Bijlmer-Centrum 
##                                     61 
##                           Bijlmer-Oost 
##                                     72 
##                          Bos en Lommer 
##                                    695 
##                 Buitenveldert - Zuidas 
##                                    160 
##                           Centrum-Oost 
##                                   1345 
##                           Centrum-West 
##                                   1878 
##                 De Aker - Nieuw Sloten 
##                                     91 
##                 De Baarsjes - Oud-West 
##                                   2430 
##                De Pijp - Rivierenbuurt 
##                                   1619 
##                  Gaasperdam - Driemond 
##                                     63 
##                Geuzenveld - Slotermeer 
##                                    138 
##               IJburg - Zeeburgereiland 
##                                    272 
##                             Noord-Oost 
##                                    159 
##                             Noord-West 
##                                    186 
## Oostelijk Havengebied - Indische Buurt 
##                                    694 
##                                 Osdorp 
##                                     99 
##                              Oud-Noord 
##                                    390 
##                               Oud-Oost 
##                                    806 
##                            Slotervaart 
##                                    280 
##                        Watergraafsmeer 
##                                    362 
##                             Westerpark 
##                                   1029 
##                                   Zuid 
##                                   1008

It is also possible to create cross-tabulations; showing the frequency of two attributes - here we look at neighborhood and bedrooms:

# Cross-Tabulation
table(amsterdam$neighbourhood_cleansed,amsterdam$bedrooms)
##                                         
##                                             0    1    2    3    4    5
##   Bijlmer-Centrum                           5   54    0    2    0    0
##   Bijlmer-Oost                              3   49   17    3    0    0
##   Bos en Lommer                            49  453  154   35    4    0
##   Buitenveldert - Zuidas                    4   94   40   14    7    1
##   Centrum-Oost                             88  792  333   95   29    6
##   Centrum-West                            164 1167  420   86   32    7
##   De Aker - Nieuw Sloten                    6   48   21   12    4    0
##   De Baarsjes - Oud-West                   80 1607  559  139   36    7
##   De Pijp - Rivierenbuurt                  81  993  409  112   21    3
##   Gaasperdam - Driemond                     0   48    8    5    2    0
##   Geuzenveld - Slotermeer                   0   96   24   10    7    1
##   IJburg - Zeeburgereiland                  9   95   63   58   40    5
##   Noord-Oost                                6   74   33   28   18    0
##   Noord-West                                9   90   43   28   15    0
##   Oostelijk Havengebied - Indische Buurt   26  428  165   55   11    3
##   Osdorp                                    2   66   25    5    1    0
##   Oud-Noord                                18  190   99   54   18    2
##   Oud-Oost                                 27  528  188   49   12    1
##   Slotervaart                               8  175   69   25    3    0
##   Watergraafsmeer                          56  157   71   55   22    1
##   Westerpark                               49  706  204   57    5    1
##   Zuid                                     27  590  246   90   43   10
##                                         
##                                             6    7    8    9   10
##   Bijlmer-Centrum                           0    0    0    0    0
##   Bijlmer-Oost                              0    0    0    0    0
##   Bos en Lommer                             0    0    0    0    0
##   Buitenveldert - Zuidas                    0    0    0    0    0
##   Centrum-Oost                              1    0    0    0    1
##   Centrum-West                              2    0    0    0    0
##   De Aker - Nieuw Sloten                    0    0    0    0    0
##   De Baarsjes - Oud-West                    2    0    0    0    0
##   De Pijp - Rivierenbuurt                   0    0    0    0    0
##   Gaasperdam - Driemond                     0    0    0    0    0
##   Geuzenveld - Slotermeer                   0    0    0    0    0
##   IJburg - Zeeburgereiland                  0    0    1    1    0
##   Noord-Oost                                0    0    0    0    0
##   Noord-West                                0    0    0    0    1
##   Oostelijk Havengebied - Indische Buurt    3    0    0    2    1
##   Osdorp                                    0    0    0    0    0
##   Oud-Noord                                 1    2    2    0    4
##   Oud-Oost                                  0    1    0    0    0
##   Slotervaart                               0    0    0    0    0
##   Watergraafsmeer                           0    0    0    0    0
##   Westerpark                                1    1    0    0    5
##   Zuid                                      2    0    0    0    0

We might want to store this as a new object which we can do in the usual way - however, if you look at the table structure (e.g. View(neigh_bedrooms) you will see that it is stored in narrow format rather than how the table is printed:

# Cross-Tabulation
neigh_bedrooms <- table(amsterdam$neighbourhood_cleansed,amsterdam$bedrooms)

From a cross tabulation table object you can create row and column frequencies using the margin.table() function, and convert the counts into percentages of row or column totals with prop.table() - round() is also used to limit the number of decimal places displayed:

# Row frequencies
margin.table(neigh_bedrooms,1)
## 
##                        Bijlmer-Centrum 
##                                     61 
##                           Bijlmer-Oost 
##                                     72 
##                          Bos en Lommer 
##                                    695 
##                 Buitenveldert - Zuidas 
##                                    160 
##                           Centrum-Oost 
##                                   1345 
##                           Centrum-West 
##                                   1878 
##                 De Aker - Nieuw Sloten 
##                                     91 
##                 De Baarsjes - Oud-West 
##                                   2430 
##                De Pijp - Rivierenbuurt 
##                                   1619 
##                  Gaasperdam - Driemond 
##                                     63 
##                Geuzenveld - Slotermeer 
##                                    138 
##               IJburg - Zeeburgereiland 
##                                    272 
##                             Noord-Oost 
##                                    159 
##                             Noord-West 
##                                    186 
## Oostelijk Havengebied - Indische Buurt 
##                                    694 
##                                 Osdorp 
##                                     99 
##                              Oud-Noord 
##                                    390 
##                               Oud-Oost 
##                                    806 
##                            Slotervaart 
##                                    280 
##                        Watergraafsmeer 
##                                    362 
##                             Westerpark 
##                                   1029 
##                                   Zuid 
##                                   1008
# Column frequencies
margin.table(neigh_bedrooms,2)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##  717 8500 3191 1017  330   48   12    4    3    3   12
# Row percentages
round(prop.table(neigh_bedrooms,1),2)
##                                         
##                                             0    1    2    3    4    5
##   Bijlmer-Centrum                        0.08 0.89 0.00 0.03 0.00 0.00
##   Bijlmer-Oost                           0.04 0.68 0.24 0.04 0.00 0.00
##   Bos en Lommer                          0.07 0.65 0.22 0.05 0.01 0.00
##   Buitenveldert - Zuidas                 0.02 0.59 0.25 0.09 0.04 0.01
##   Centrum-Oost                           0.07 0.59 0.25 0.07 0.02 0.00
##   Centrum-West                           0.09 0.62 0.22 0.05 0.02 0.00
##   De Aker - Nieuw Sloten                 0.07 0.53 0.23 0.13 0.04 0.00
##   De Baarsjes - Oud-West                 0.03 0.66 0.23 0.06 0.01 0.00
##   De Pijp - Rivierenbuurt                0.05 0.61 0.25 0.07 0.01 0.00
##   Gaasperdam - Driemond                  0.00 0.76 0.13 0.08 0.03 0.00
##   Geuzenveld - Slotermeer                0.00 0.70 0.17 0.07 0.05 0.01
##   IJburg - Zeeburgereiland               0.03 0.35 0.23 0.21 0.15 0.02
##   Noord-Oost                             0.04 0.47 0.21 0.18 0.11 0.00
##   Noord-West                             0.05 0.48 0.23 0.15 0.08 0.00
##   Oostelijk Havengebied - Indische Buurt 0.04 0.62 0.24 0.08 0.02 0.00
##   Osdorp                                 0.02 0.67 0.25 0.05 0.01 0.00
##   Oud-Noord                              0.05 0.49 0.25 0.14 0.05 0.01
##   Oud-Oost                               0.03 0.66 0.23 0.06 0.01 0.00
##   Slotervaart                            0.03 0.62 0.25 0.09 0.01 0.00
##   Watergraafsmeer                        0.15 0.43 0.20 0.15 0.06 0.00
##   Westerpark                             0.05 0.69 0.20 0.06 0.00 0.00
##   Zuid                                   0.03 0.59 0.24 0.09 0.04 0.01
##                                         
##                                             6    7    8    9   10
##   Bijlmer-Centrum                        0.00 0.00 0.00 0.00 0.00
##   Bijlmer-Oost                           0.00 0.00 0.00 0.00 0.00
##   Bos en Lommer                          0.00 0.00 0.00 0.00 0.00
##   Buitenveldert - Zuidas                 0.00 0.00 0.00 0.00 0.00
##   Centrum-Oost                           0.00 0.00 0.00 0.00 0.00
##   Centrum-West                           0.00 0.00 0.00 0.00 0.00
##   De Aker - Nieuw Sloten                 0.00 0.00 0.00 0.00 0.00
##   De Baarsjes - Oud-West                 0.00 0.00 0.00 0.00 0.00
##   De Pijp - Rivierenbuurt                0.00 0.00 0.00 0.00 0.00
##   Gaasperdam - Driemond                  0.00 0.00 0.00 0.00 0.00
##   Geuzenveld - Slotermeer                0.00 0.00 0.00 0.00 0.00
##   IJburg - Zeeburgereiland               0.00 0.00 0.00 0.00 0.00
##   Noord-Oost                             0.00 0.00 0.00 0.00 0.00
##   Noord-West                             0.00 0.00 0.00 0.00 0.01
##   Oostelijk Havengebied - Indische Buurt 0.00 0.00 0.00 0.00 0.00
##   Osdorp                                 0.00 0.00 0.00 0.00 0.00
##   Oud-Noord                              0.00 0.01 0.01 0.00 0.01
##   Oud-Oost                               0.00 0.00 0.00 0.00 0.00
##   Slotervaart                            0.00 0.00 0.00 0.00 0.00
##   Watergraafsmeer                        0.00 0.00 0.00 0.00 0.00
##   Westerpark                             0.00 0.00 0.00 0.00 0.00
##   Zuid                                   0.00 0.00 0.00 0.00 0.00
# Column percentages
round(prop.table(neigh_bedrooms,2),2)
##                                         
##                                             0    1    2    3    4    5
##   Bijlmer-Centrum                        0.01 0.01 0.00 0.00 0.00 0.00
##   Bijlmer-Oost                           0.00 0.01 0.01 0.00 0.00 0.00
##   Bos en Lommer                          0.07 0.05 0.05 0.03 0.01 0.00
##   Buitenveldert - Zuidas                 0.01 0.01 0.01 0.01 0.02 0.02
##   Centrum-Oost                           0.12 0.09 0.10 0.09 0.09 0.12
##   Centrum-West                           0.23 0.14 0.13 0.08 0.10 0.15
##   De Aker - Nieuw Sloten                 0.01 0.01 0.01 0.01 0.01 0.00
##   De Baarsjes - Oud-West                 0.11 0.19 0.18 0.14 0.11 0.15
##   De Pijp - Rivierenbuurt                0.11 0.12 0.13 0.11 0.06 0.06
##   Gaasperdam - Driemond                  0.00 0.01 0.00 0.00 0.01 0.00
##   Geuzenveld - Slotermeer                0.00 0.01 0.01 0.01 0.02 0.02
##   IJburg - Zeeburgereiland               0.01 0.01 0.02 0.06 0.12 0.10
##   Noord-Oost                             0.01 0.01 0.01 0.03 0.05 0.00
##   Noord-West                             0.01 0.01 0.01 0.03 0.05 0.00
##   Oostelijk Havengebied - Indische Buurt 0.04 0.05 0.05 0.05 0.03 0.06
##   Osdorp                                 0.00 0.01 0.01 0.00 0.00 0.00
##   Oud-Noord                              0.03 0.02 0.03 0.05 0.05 0.04
##   Oud-Oost                               0.04 0.06 0.06 0.05 0.04 0.02
##   Slotervaart                            0.01 0.02 0.02 0.02 0.01 0.00
##   Watergraafsmeer                        0.08 0.02 0.02 0.05 0.07 0.02
##   Westerpark                             0.07 0.08 0.06 0.06 0.02 0.02
##   Zuid                                   0.04 0.07 0.08 0.09 0.13 0.21
##                                         
##                                             6    7    8    9   10
##   Bijlmer-Centrum                        0.00 0.00 0.00 0.00 0.00
##   Bijlmer-Oost                           0.00 0.00 0.00 0.00 0.00
##   Bos en Lommer                          0.00 0.00 0.00 0.00 0.00
##   Buitenveldert - Zuidas                 0.00 0.00 0.00 0.00 0.00
##   Centrum-Oost                           0.08 0.00 0.00 0.00 0.08
##   Centrum-West                           0.17 0.00 0.00 0.00 0.00
##   De Aker - Nieuw Sloten                 0.00 0.00 0.00 0.00 0.00
##   De Baarsjes - Oud-West                 0.17 0.00 0.00 0.00 0.00
##   De Pijp - Rivierenbuurt                0.00 0.00 0.00 0.00 0.00
##   Gaasperdam - Driemond                  0.00 0.00 0.00 0.00 0.00
##   Geuzenveld - Slotermeer                0.00 0.00 0.00 0.00 0.00
##   IJburg - Zeeburgereiland               0.00 0.00 0.33 0.33 0.00
##   Noord-Oost                             0.00 0.00 0.00 0.00 0.00
##   Noord-West                             0.00 0.00 0.00 0.00 0.08
##   Oostelijk Havengebied - Indische Buurt 0.25 0.00 0.00 0.67 0.08
##   Osdorp                                 0.00 0.00 0.00 0.00 0.00
##   Oud-Noord                              0.08 0.50 0.67 0.00 0.33
##   Oud-Oost                               0.00 0.25 0.00 0.00 0.00
##   Slotervaart                            0.00 0.00 0.00 0.00 0.00
##   Watergraafsmeer                        0.00 0.00 0.00 0.00 0.00
##   Westerpark                             0.08 0.25 0.00 0.00 0.42
##   Zuid                                   0.17 0.00 0.00 0.00 0.00

If you want to convert a “table” back into an object then this is fairly simple:

# Convert table to data frame
neigh_bedrooms_DF <- as.data.frame(neigh_bedrooms)

# View top 6 rows
head(neigh_bedrooms_DF)
##                     Var1 Var2 Freq
## 1        Bijlmer-Centrum    0    5
## 2           Bijlmer-Oost    0    3
## 3          Bos en Lommer    0   49
## 4 Buitenveldert - Zuidas    0    4
## 5           Centrum-Oost    0   88
## 6           Centrum-West    0  164

However, you will note that the data frame has been created in narrow rather than wide format (as displayed). To create a wide format you use a different function:

# Convert table to data frame
neigh_bedrooms_DF <- as.data.frame.matrix(neigh_bedrooms)

# View top 6 rows
head(neigh_bedrooms_DF)
##                          0    1   2  3  4 5 6 7 8 9 10
## Bijlmer-Centrum          5   54   0  2  0 0 0 0 0 0  0
## Bijlmer-Oost             3   49  17  3  0 0 0 0 0 0  0
## Bos en Lommer           49  453 154 35  4 0 0 0 0 0  0
## Buitenveldert - Zuidas   4   94  40 14  7 1 0 0 0 0  0
## Centrum-Oost            88  792 333 95 29 6 1 0 0 0  1
## Centrum-West           164 1167 420 86 32 7 2 0 0 0  0

Create aggregations and summary statistics

We have shown how we can use various descriptive summary functions for single columns, however, it is also possible to combine these with further functions which make these calculations within aggregations. This is especially useful for geographic application where you can create summaries by a defined area. In the following example we will use the function aggregate() to find out what the mean price is by neighborhood:

aggregate(x=amsterdam[,"price"],by=list(amsterdam[,"neighbourhood_cleansed"]),FUN=mean)
##                                   Group.1         x
## 1                         Bijlmer-Centrum  56.57377
## 2                            Bijlmer-Oost  72.65278
## 3                           Bos en Lommer  99.66619
## 4                  Buitenveldert - Zuidas 118.14375
## 5                            Centrum-Oost 160.51747
## 6                            Centrum-West 168.88765
## 7                  De Aker - Nieuw Sloten 119.82418
## 8                  De Baarsjes - Oud-West 124.60823
## 9                 De Pijp - Rivierenbuurt 131.37122
## 10                  Gaasperdam - Driemond  79.25397
## 11                Geuzenveld - Slotermeer  96.89130
## 12               IJburg - Zeeburgereiland 141.01471
## 13                             Noord-Oost 113.18868
## 14                             Noord-West  99.44086
## 15 Oostelijk Havengebied - Indische Buurt 118.76369
## 16                                 Osdorp  94.51515
## 17                              Oud-Noord 127.27949
## 18                               Oud-Oost 120.61663
## 19                            Slotervaart  95.60357
## 20                        Watergraafsmeer 123.14088
## 21                             Westerpark 122.60836
## 22                                   Zuid 151.09524

In this example you will see that the “by=” option only accepts a “list” which we can easily create using the function list(). We can add additional attributes to the list to get further sub-aggregations - however, we won’t display this here as it creates quite a large table.

aggregate(x=amsterdam[,"price"],by=list(amsterdam[,"neighbourhood_cleansed"],amsterdam[,"property_type"]),FUN=mean)

So far we have only considered the mean price, however, what if we wanted to create a number of statistics for each aggregation. We can do this quite simply, however, this requires that we write our own custom function. You have used lots of functions so far in these practicals, some have been built into base R, and others become available by loading packages.

The basic structure of a function is:

function_name <- function(argument1,argument2,...){
    Statments that do something...
    return(something to return from the function)
}

In this example we create a new function called “data_description” which calculates a mean and counts the number of records - these are stored within a new object called “stats” which is returned when the function is run.

# User defined function
data_description <- function(x) { 
  stats <- c(M = mean(x), S = length(x))
  return(stats)
  }

We can see how this works by simply supplying the function some data - in this case, all the prices:

data_description(amsterdam$price)
##          M          S 
##   132.8304 13837.0000

This returned the mean price, plus the length of the supplied string which is the same as the number of rows in the data frame - i.e. 13837. We can now use our function to create means for aggregations using a very helpful package called doBy which we will load now:

install.packages("doBy")
library(doBy)

Using the summaryBy() function we can now apply our data_description() function to a set of aggregations. We separate the price from the grouping variable using the “~” symbol. If you are wondering what an Earth House is, Wiki has an answer

  summaryBy(price ~ property_type, data = amsterdam, FUN = data_description )
##      property_type   price.M price.S
## 1        Apartment 127.61055   10954
## 2  Bed & Breakfast 108.74745     392
## 3             Boat 186.59353     433
## 4         Bungalow 118.66667       3
## 5            Cabin  90.52174      23
## 6        Camper/RV  63.69231      13
## 7           Chalet  93.33333       3
## 8      Condominium 124.59903     207
## 9             Dorm  52.33333       3
## 10     Earth House  90.00000       1
## 11           House 161.74914    1459
## 12             Hut  65.00000       1
## 13            Loft 134.39048     105
## 14           Other 145.42105      57
## 15            Tent  20.00000       1
## 16       Townhouse 157.85806     155
## 17           Villa 157.26923      26
## 18            Yurt  85.00000       1

Spatial summary measures

So far we have considered a pre-defined geography in the previous example: neighborhoods. However, a common spatial analysis task is to create descriptive statistics related to the context of features. This task will typically require a buffer to be created around a feature and then data aggregated and summarized within the buffers.

First we will import a spatial data file that relates to historic building locations:

#Load package
library(rgdal,verbose = FALSE)
## Loading required package: sp
## rgdal: version: 1.2-5, (SVN revision 648)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.1.2, released 2016/10/24
##  Path to GDAL shared files: /opt/local/share/gdal
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-4
historic = readOGR("./data/historic_buildings.geojson", "OGRGeoJSON",verbose = FALSE)

We will then create a spatial point data frame using Airbnb data:

# Create the SpatialPointsDataFrame
SP_amsterdam <- SpatialPointsDataFrame(coords = data.frame(amsterdam$longitude, amsterdam$latitude), data = amsterdam, proj4string = historic@proj4string)

We will now use the function gBuffer() which is found within the Rgeos package to create radial buffers around the historic building locations, however, before we can do this we must first alter the projection of both the spatial point data frames so that they have a unit in meters - currently these are projected as WGS84. A common projection in meters for the Netherlands is “Amersfoort / RD Old EPSG:28992”. We can convert both objects to this projection using the spTransform() function.

# Convert Airbnb
SP_amsterdam <- spTransform(SP_amsterdam, CRS( "+init=epsg:28992" ))

# Convert historic buildings
historic <- spTransform(historic, CRS( "+init=epsg:28992" ) ) 

# View Airbnb
plot(SP_amsterdam)

# View historic buildings
plot(historic)

Now that the projections are in a CRS with meters as the unit we can specify a sensible width for the gBuffer function - we will set this as 200 meters.

#Load package
library(rgeos,verbose = FALSE)
## rgeos version: 0.3-21, (SVN revision 540)
##  GEOS runtime version: 3.4.2-CAPI-1.8.2 r3921 
##  Linking to sp version: 1.2-3 
##  Polygon checking: TRUE
#Create buffer
historic_Buffers <- gBuffer(historic, width = 200, byid = TRUE)
# Show buffer
plot(historic_Buffers)

This function has created a SpatialPolygonsDataFrame with 200m buffers around the points. The data attributes of the new object are the same as the points:

# View top six rows of data
head(historic_Buffers@data)
##                                Naam Hoofdgroep Functie
## 1            Atheneum Illustre 1632   Openbaar        
## 2                    Oude Kerk 1306    Religie    Kerk
## 3 Windmolenzijdepoorthuis 1300-1380   Militair   Poort
## 4         Haarlemmerpoort 1425-1579   Militair   Poort
## 5         Bindwijkerpoort 1340-1425   Militair   Poort
## 6        Heiligewegspoort 1425-1610   Militair   Poort
##                       Foto Website                       Opmerking Begin
## 1    Atheneum Illustre.jpg         Bron: Gerrit de Broen 1774-1782  1632
## 2            Oude kerk.jpg           Bron: Cornelis Anthonisz 1544  1306
## 3                                                                   1300
## 4      Haarlemmerpoort.jpg           Bron: Cornelis Anthonisz 1544  1425
## 5                                                                   1340
## 6 Heiligewegspoort1544.jpg           Bron: Cornelis Anthonisz 1544  1425
##   Eind
## 1    0
## 2    0
## 3 1380
## 4 1579
## 5 1425
## 6 1610

As we illustrated in a previous practical (see 2. Data Manipulation in R) we can use point.in.poly() to identify which points lie within a polygon; however, in this example things are a little more complex and many of the polygons overlap; and thus a point can be in multiple polygons. As such we will use a slightly less automated technique. For this we will use the over() function, however, because we are interested in calculating some values for each buffer area, we first need to add an extra parameter - returnList=TRUE. This returns a list of data frames, where each element of the list is a separate data frame and refers to a buffer, and the values those Airbnb records that are within this zone. This is a little different from the lists you created previously that were just lists of character strings or numerics.

# Create point in polygon list
o <- over(historic_Buffers, SP_amsterdam, returnList = TRUE)
# View length of the list - this is the same length as the number of historic buildings / buffers
length(o)
## [1] 200

If we examine the object o, we will see also see that this comprises a list of data frames. The summary function tells you about an object - head, is used to wrap around the function so only the first six elements are shown:

head(summary(o))
##   Length Class      Mode
## 1 9      data.frame list
## 2 9      data.frame list
## 3 9      data.frame list
## 4 9      data.frame list
## 5 9      data.frame list
## 6 9      data.frame list
# View an item from the list (in this case, item 199)
o[[199]]
##           id neighbourhood_cleansed latitude longitude property_type
## 447  9338308              Oud-Noord 52.38540  4.901966     Apartment
## 533  7105462              Oud-Noord 52.38271  4.902881          Boat
## 535 12380395              Oud-Noord 52.38458  4.901996          Loft
## 546 10347840              Oud-Noord 52.38463  4.901039     Apartment
## 557   156815              Oud-Noord 52.38328  4.904111          Boat
## 580  7198961              Oud-Noord 52.38234  4.903504     Apartment
## 594  1069364              Oud-Noord 52.38231  4.903764     Apartment
## 659  9368708              Oud-Noord 52.38510  4.901042     Apartment
## 735 11235395              Oud-Noord 52.38494  4.902726     Apartment
## 761  9376537              Oud-Noord 52.38515  4.900746     Apartment
##           room_type bedrooms price number_of_reviews
## 447 Entire home/apt        3   149                 4
## 533 Entire home/apt        2    80                38
## 535    Private room        1    75                 2
## 546    Private room        1    96                20
## 557 Entire home/apt       10   596                10
## 580 Entire home/apt        1   110                 9
## 594 Entire home/apt        1   220                18
## 659    Private room        1    69                 7
## 735 Entire home/apt        1   136                 2
## 761 Entire home/apt        1   110                13

We will discuss plotting in more detail during a later practical, however, here we plot the results of the point in polygon:

We can now look at the results of the point in polygon analysis and calculate the characteristics within each buffer. The first stage is to use the lapply() function to apply a function across the list - the first function removes all columns within each of the data frames within the list, apart from those specified; and the second calculates the mean - note that we also use the unlist() function that creates a vector of the prices.

# Keep just the price
o_cut <- lapply(o, function(x) x[(names(x) = "price")])

#Show just the prices for 199th item in the list
o_cut[199]
## $`199`
##     price
## 447   149
## 533    80
## 535    75
## 546    96
## 557   596
## 580   110
## 594   220
## 659    69
## 735   136
## 761   110
#Create a list of the mean price within the buffer
average_buffer_price <- lapply(o_cut, function(x) mean(unlist(x)))

We will now convert this list to a data frame and then append this back onto the historic buffer locations:

# Create data frame
average_buffer_price <- data.frame(unlist(average_buffer_price))

# Update column names
colnames(average_buffer_price) <- "Av_price_200m"

# Append the buildings
historic_Buffers@data <- cbind(historic_Buffers@data,average_buffer_price)

# View the top six rows
head(historic@data)
##                                Naam Hoofdgroep Functie
## 1            Atheneum Illustre 1632   Openbaar        
## 2                    Oude Kerk 1306    Religie    Kerk
## 3 Windmolenzijdepoorthuis 1300-1380   Militair   Poort
## 4         Haarlemmerpoort 1425-1579   Militair   Poort
## 5         Bindwijkerpoort 1340-1425   Militair   Poort
## 6        Heiligewegspoort 1425-1610   Militair   Poort
##                       Foto Website                       Opmerking Begin
## 1    Atheneum Illustre.jpg         Bron: Gerrit de Broen 1774-1782  1632
## 2            Oude kerk.jpg           Bron: Cornelis Anthonisz 1544  1306
## 3                                                                   1300
## 4      Haarlemmerpoort.jpg           Bron: Cornelis Anthonisz 1544  1425
## 5                                                                   1340
## 6 Heiligewegspoort1544.jpg           Bron: Cornelis Anthonisz 1544  1425
##   Eind
## 1    0
## 2    0
## 3 1380
## 4 1579
## 5 1425
## 6 1610

The buffers are also shown on a map:

Further resources / training