Scott Chamberlain
2016-11-15
Data on the web can include
| | Yay! | Boo! |
|---|---|---|
Scraping | Mostly always an option | Code breaks easily |
Can be blocked easily | ||
Files/FTP | Likely an R thing to read it | Links to files can break |
Watch out for huge files | ||
Database dump | Great if you know SQL | Not so great if you don’t |
Flexible query syntax | ||
Avoid R memory lim. | ||
API | Interface is stable(ish) | Not great when want “all the data” |
HTTP is a good UI | HTTP APIs are not standardized |
Let’s scrape some weather data
rvest
is an easy way to get started
# Scrape the Weather Underground daily-history page for airport KAMW
# (Ames, IA) on 2016-11-15 and show the first HTML table on the page
# (the Actual / Average / Record summary).
library("rvest")  # double quotes, consistent with library("httr") below

# Build the URL in two pieces to keep lines short
url <- paste0(
  "https://www.wunderground.com/history/airport/",
  "KAMW/2016/11/15/DailyHistory.html"
)

weather_data <- read_html(url)       # fetch and parse the page
head(html_table(weather_data)[[1]])  # first <table> as a data frame
#> Actual Average Record
#> 1 Temperature
#> 2 Mean Temperature 47 °F 37 °F
#> 3 Max Temperature 66 °F 46 °F 72 °F\n(2001)
#> 4 Min Temperature 28 °F 27 °F 14 °F\n(2014)
#> 5 Degree Days
#> 6 Heating Degree Days 18 28
What did we just do?
Go to View Page Source
by right-clicking on the page
The source is a pile of html. Very exciting!
3 minutes
Use SelectorGadget to find the xpath to an HTML element.
They look like this
crul
, curl
, httr
, RCurl
download.file()
read.table
and friends
Example FTP data collection
Download file
# Download one year of ISD (Integrated Surface Data) observations for a
# single station from NOAA's FTP server, then peek at the raw lines.
library("httr")

url <- "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/1991/010510-99999-1991.gz"

# write_disk() streams the response straight to disk instead of holding
# it in memory; overwrite = TRUE lets this example re-run cleanly when
# the file already exists (the default errors on an existing path).
res <- GET(url, write_disk(path = basename(url), overwrite = TRUE))
res$request$output$path  # where the file landed

# OR, with base R:
download.file(url, destfile = basename(url))

# Then, read data: first 10 raw fixed-width records
readLines(basename(url), n = 10)
#> [1] "0036010510999991991010112004+69583+023533FM-12+037499999V0203201N00101030001CN0300001N9-01651-01831999999ADDAG12000GF108991031071008001021999"
#> [2] "0080010510999991991010118004+69583+023533FM-12+037499999V0200301N00101012001CN0300001N9-01541-01711999999ADDAA106000091AG14000GF108991081001012501021999KA1120M+01201REMSYN011333 10120"
#> [3] "0109010510999991991010206004+69583+023533FM-12+037499999V0200901N00101220001CN0300001N9-01931-02121999999ADDAA106000091AG14000AJ100499199999999GF102991001001999999001081IA1239KA1120N-02461REMSYN017333 21246 47049"
#> [4] "0036010510999991991010212004+69583+023533FM-12+037499999V0201601N00101030001CN0200001N9-01441-01621999999ADDAG12000GF108991031071004501021999"
#> [5] "0118010510999991991010218004+69583+023533FM-12+037499999V0201601N00101002401CN0030001N9-01201-01341999999ADDAA199001091AG10002AY171061AY221061GA1091+999999999GF109991091999999999999999KA1120M-01201MW1731REMSYN011333 11120"
#> [6] "0131010510999991991010306004+69583+023533FM-12+037499999V0201601N00261030001CN0200001N9-01241-01461999999ADDAA199008091AG10001AJ100519199999999AY171061AY231061GF108991021071004501021999IA1239KA1120N-01241MW1021REMSYN017333 21124 47051"
#> [7] "0058010510999991991010312004+69583+023533FM-12+037499999V0201701N00461030001CN0200001N9-01331-01571999999ADDAG12001AY171061AY231061GF108991031071004501021999MW1711"
#> [8] "0102010510999991991010318004+69583+023533FM-12+037499999V0201701N00461030001CN0200001N9-01291-01551999999ADDAA199004091AG10001AY171061AY231061GF108991041071004501021999KA1120M-01261MW1021REMSYN011333 11126"
#> [9] "0131010510999991991010406004+69583+023533FM-12+037499999V0201601N00261030001CN0200001N9-01161-01431999999ADDAA106000091AG14000AJ100449199999999AY131061AY221061GF108991021071004501021999IA1239KA1120N-01351MW1021REMSYN017333 21135 47044"
#> [10] "0036010510999991991010412004+69583+023533FM-12+037499999V0201701N00261030001CN0200001N9-01161-01401999999ADDAG12000GF108991031071004501021999"
Use our package isdparser
# Parse the fixed-width ISD file downloaded above into a tibble
# (one row per observation, ~130 parsed columns).
library("isdparser")  # quoted, consistent with library("httr") above
isd_parse(path = basename(url))  # `url` comes from the download step
#> # A tibble: 1,038 × 130
#> total_chars usaf_station wban_station date time date_flag
#> <dbl> <chr> <chr> <date> <chr> <chr>
#> 1 36 010510 99999 1991-01-01 1200 4
#> 2 80 010510 99999 1991-01-01 1800 4
#> 3 109 010510 99999 1991-01-02 0600 4
#> 4 36 010510 99999 1991-01-02 1200 4
#> 5 118 010510 99999 1991-01-02 1800 4
#> 6 131 010510 99999 1991-01-03 0600 4
#> 7 58 010510 99999 1991-01-03 1200 4
#> 8 102 010510 99999 1991-01-03 1800 4
#> 9 131 010510 99999 1991-01-04 0600 4
#> 10 36 010510 99999 1991-01-04 1200 4
#> # ... with 1,028 more rows, and 124 more variables: latitude <dbl>,
#> # longitude <dbl>, type_code <chr>, elevation <dbl>, call_letter <chr>,
#> # quality <chr>, wind_direction <dbl>, wind_direction_quality <chr>,
#> # wind_code <chr>, wind_speed <dbl>, wind_speed_quality <chr>,
#> # ceiling_height <chr>, ceiling_height_quality <chr>,
#> # ceiling_height_determination <chr>, ceiling_height_cavok <chr>,
#> # visibility_distance <chr>, visibility_distance_quality <chr>,
#> # visibility_code <chr>, visibility_code_quality <chr>,
#> # temperature <dbl>, temperature_quality <chr>,
#> # temperature_dewpoint <dbl>, temperature_dewpoint_quality <chr>,
#> # air_pressure <dbl>, air_pressure_quality <chr>,
#> # AG1_precipitation_estimated <chr>, AG1_discrepancy_code <chr>,
#> # AG1_estimated_water_depth <chr>, GF1_sky_condition <chr>,
#> # GF1_coverage <chr>, GF1_opaque_coverage <chr>,
#> # GF1_coverage_quality <chr>, GF1_lowest_cover <chr>,
#> # GF1_lowest_cover_quality <chr>, GF1_low_cloud_genus <chr>,
#> # GF1_low_cloud_genus_quality <chr>, GF1_lowest_cloud_base_height <chr>,
#> # GF1_lowest_cloud_base_height_quality <chr>, GF1_mid_cloud_genus <chr>,
#> # GF1_mid_cloud_genus_quality <chr>, GF1_high_cloud_genus <chr>,
#> # GF1_high_cloud_genus_quality <chr>, AA1_precipitation_liquid <chr>,
#> # AA1_period_quantity_hrs <chr>, AA1_depth <chr>,
#> # AA1_condition_quality <chr>, AA1_quality_code <chr>,
#> # N01_original_observation <chr>, N01_original_value_text <chr>,
#> # N01_units_code <chr>, N01_parameter_code <chr>,
#> # KA1_extreme_temp <chr>, KA1_period_quantity <chr>, KA1_max_min <chr>,
#> # KA1_temp <chr>, KA1_temp_quality <chr>, REM_remarks <chr>,
#> # REM_identifier <chr>, REM_length_quantity <chr>, REM_comment <chr>,
#> # AJ1_snow_depth <chr>, AJ1_depth_dimension <chr>,
#> # AJ1_condition_code <chr>, AJ1_quality_code <chr>,
#> # AJ1_equivalent_water_depth <chr>,
#> # AJ1_equivalent_water_condition_code <chr>,
#> # AJ1_equivalent_water_condition_quality_code <chr>,
#> # IA1_ground_surface <chr>, IA1_ground_surface_code <chr>,
#> # IA1_ground_surface_code_quality <chr>, AY1_past_weather_manual <chr>,
#> # AY1_condition_code <chr>, AY1_condition_quality <chr>,
#> # AY1_period <chr>, AY1_period_quality <chr>,
#> # AY2_past_weather_manual <chr>, AY2_condition_code <chr>,
#> # AY2_condition_quality <chr>, AY2_period <chr>,
#> # AY2_period_quality <chr>, GA1_sky_cover_layer_identifier <chr>,
#> # GA1_coverage_code <chr>, GA1_coverage_quality_code <chr>,
#> # GA1_base_height_dimension <chr>, GA1_base_height_quality_code <chr>,
#> # GA1_cloud_type_code <chr>, GA1_cloud_type_quality_code <chr>,
#> # MW1_first_weather_reported <chr>,
#> # MW1_manual_atmospheric_condition_code <chr>,
#> # MW1_condition_quality <chr>, KA2_extreme_temp <chr>,
#> # KA2_period_quantity <chr>, KA2_max_min <chr>, KA2_temp <chr>,
#> # KA2_temp_quality <chr>, N02_original_observation <chr>,
#> # N02_original_value_text <chr>, N02_units_code <chr>,
#> # N02_parameter_code <chr>, EQD_observation_identifier <chr>, ...
3 minutes
Web APIs are:
/search
- to search
/&lt;species id&gt;
- to get data on a particular species by ID
Components
R clients for APIs
crul
- https://github.com/ropenscilabs/crul
request
- https://github.com/sckott/request
curl
- https://github.com/jeroenooms/curl
httr
- https://github.com/hadley/httr
library("request")
# Search the OMDB movie API for "iron man" with the `request` package's
# pipe-able wrapper: api() sets the base URL, api_query() appends the
# query string (?s=...&r=...).
library("dplyr")
res <- api("http://omdbapi.com") %>%
# NOTE(review): `r = json` is unquoted — the request package appears to
# capture it lazily and send it as the string "json"; confirm against
# the request package docs.
api_query(s = "iron man", r = json)
res$Search  # the first page of matches as a tibble
#> # A tibble: 10 × 5
#> Title Year imdbID Type
#> * <chr> <chr> <chr> <chr>
#> 1 Iron Man 2008 tt0371746 movie
#> 2 Iron Man 3 2013 tt1300854 movie
#> 3 Iron Man 2 2010 tt1228705 movie
#> 4 The Man in the Iron Mask 1998 tt0120744 movie
#> 5 The Man with the Iron Fists 2012 tt1258972 movie
#> 6 Tetsuo, the Iron Man 1989 tt0096251 movie
#> 7 The Invincible Iron Man 2007 tt0903135 movie
#> 8 Iron Man: Rise of Technovore 2013 tt2654124 movie
#> 9 The Man with the Iron Fists 2 2015 tt3625152 movie
#> 10 Man of Iron 1981 tt0082222 movie
#> # ... with 1 more variables: Poster <chr>
3 minutes
We connect you to open data
Check us out at https://ropensci.org
We’d love to help with data problems that fit in our wheelhouse.
Contribute! It’s a great way to learn more R.
Lots more …, check out https://ropensci.org/packages