Load packages

library(rvest)
library(ggvis)
library(tidyverse)

Demonstration

Use rvest to scrape tripadvisor.com

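Inspired by

- http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html
- `demo(tripadvisor, package = "rvest")`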


Star Wars

Goal: Scrape the first table on the Carrie Fisher Wikipedia page

# Wikipedia article to scrape.
CarrieFisherWiki <- "https://en.wikipedia.org/wiki/Carrie_Fisher"
# Download and parse the page, take the first node matching the ".wikitable"
# CSS selector, and convert that node with html_table().
html_table(html_node(read_html(CarrieFisherWiki), ".wikitable"))

The Durham Hotel

html_nodes

Read the HTML for The Durham Hotel review on TripAdvisor

# TripAdvisor review page for The Durham Hotel.
url <- "https://www.tripadvisor.com/Hotel_Review-g49092-d8470160-Reviews-The_Durham_Hotel-Durham_North_Carolina.html"
# Parse the page and collect every node matching "#REVIEWS .innerBubble";
# the per-review fields below are all extracted from this node set.
reviews <- html_nodes(read_html(url), "#REVIEWS .innerBubble")

html_attr

# For each review, read the "id" attribute of its ".quote a" link.
id <- html_attr(html_node(reviews, ".quote a"), "id")

html_text

# Text of each review's ".quote span" node.
quote <- html_text(html_node(reviews, ".quote span"))
# Text of each review's ".entry .partial_entry" node.
review <- html_text(html_node(reviews, ".entry .partial_entry"))

Variations

Post processing

# The rating is encoded in the element's class attribute; the expected form is
# "ui_bubble_rating bubble_<n>". Read only that one attribute with
# html_attr("class"): html_attrs() returns every attribute of each node as a
# list, which gsub() can only handle when "class" is the sole attribute.
# Strip the prefix, convert the remainder to an integer, and divide by 10.
# `%>%` binds tighter than `/`, so the division applies to the piped result.
rating <- reviews %>%
  html_node(".rating .ui_bubble_rating") %>%
  html_attr("class") %>%
  gsub("ui_bubble_rating bubble_", "", .) %>%
  as.integer() / 10
# Parse the "title" attribute of each review's ".rating .ratingDate" node
# using the format "%b %d, %Y", then store the result as POSIXct.
date <- as.POSIXct(
  strptime(
    html_attr(html_node(reviews, ".rating .ratingDate"), "title"),
    "%b %d, %Y"
  )
)

Manipulate Data

Preliminary Analysis in a Tibble (modern Data Frame)

DataFrame (Tibble)

# Combine the scraped vectors into a single tibble; columns are named after
# the variables. tibble() replaces data_frame(), which is deprecated in the
# tibble package.
dhotel <- tibble(id, quote, rating, date, review)
# Print the tibble.
dhotel

Summary Statistics

# Min, quartiles, median, mean and max of the rating column.
summary(dhotel[["rating"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   3.00    3.25    4.00    4.10    5.00    5.00 

Frequency Bar Chart

# Bar chart of how many reviews fall into each star rating.
# NOTE(review): the factor levels are hard-coded to 3, 4 and 5, so any other
# rating value becomes NA in `stars` -- confirm this is intended.
dhotel %>%
  group_by(rating) %>%
  mutate(
    stars = factor(
      rating,
      levels = c(3, 4, 5),
      labels = c("Three", "Four", "Five")
    )
  ) %>%
  ggvis(x = ~stars) %>%
  layer_bars()

Append Variables

Gather location and screen-name information. Select only reviews with a location value.

# Reviewer details sit in a different column of the page than the review
# text; that column is selected with the CSS class .col1of2 inside #REVIEWS.
memInfo <- html_nodes(read_html(url), "#REVIEWS .col1of2")
# Reviewer location: text of the ".location" node in each member block.
loc1 <- html_text(html_node(memInfo, ".location"))
# Reviewer screen name: text of the ".scrname" node in each member block.
screenName <- html_text(html_node(memInfo, ".scrname"))
# Add the screen name and location to dhotel as new columns.
dhotel <- dhotel %>%
  mutate(scrName = screenName, location = loc1)
# Show four columns, keeping only the rows that have a location.
dhotel %>%
  filter(!is.na(location)) %>%
  select(scrName, location, rating, quote)
LS0tDQp0aXRsZTogInJ2ZXN0IGRlbW8iDQphdXRob3I6ICJKb2huIExpdHRsZSINCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCINCm91dHB1dDogDQogIGh0bWxfbm90ZWJvb2s6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZmxvYXQ6IHllcw0KLS0tDQoNCiMjIExvYWQgcGFja2FnZXMNCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShydmVzdCkNCmxpYnJhcnkoZ2d2aXMpDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmBgYA0KDQoNCiMjIERlbW9uc3RyYXRpb24NCg0KVXNlIHJ2ZXN0IHRvIHNjcmFwZSB0cmlwYWR2aXNvci5jb20NCg0KSW5zcGlyZWQgYnkNCg0KLSBodHRwOi8vbm90ZXNvZmRhYmJsZXIuZ2l0aHViLmlvLzIwMTQwOF9ob3RlbFJldmlldy9zY3JhcGVUcmlwQWR2aXNvci5odG1sDQotIGBkZW1vKHRyaXBhZHZpc29yLCBwYWNrYWdlID0gInJ2ZXN0IilgDQoNCi0tLQ0KDQojIyBTdGFyIFdhcnMNCg0KR29hbDogIFNjcmFwZSB0aGUgZmlyc3QgdGFibGUgb24gdGhlIFtDYXJyaWUgRmlzaGVyIFdpa2lwZWRpYSBwYWdlXShodHRwczovL2VuLndpa2lwZWRpYS5vcmcvd2lraS9DYXJyaWVfRmlzaGVyKQ0KDQpgYGB7cn0NCkNhcnJpZUZpc2hlcldpa2kgPC0gImh0dHBzOi8vZW4ud2lraXBlZGlhLm9yZy93aWtpL0NhcnJpZV9GaXNoZXIiDQoNCkNhcnJpZUZpc2hlcldpa2kgJT4lIA0KICByZWFkX2h0bWwoKSAlPiUgDQogIGh0bWxfbm9kZSgiLndpa2l0YWJsZSIpICU+JSANCiAgaHRtbF90YWJsZSgpDQoNCmBgYA0KDQojIyBUaGUgRHVyaGFtIEhvdGVsDQoNCiMjIyBodG1sX25vZGVzDQoNClJlYWQgdGhlIEhUTUwgZm9yIFtfVGhlIER1cmhhbSBIb3RlbF8gcmV2aWV3IG9uIFRyaXBBZHZpc29yXShodHRwczovL3d3dy50cmlwYWR2aXNvci5jb20vSG90ZWxfUmV2aWV3LWc0OTA5Mi1kODQ3MDE2MC1SZXZpZXdzLVRoZV9EdXJoYW1fSG90ZWwtRHVyaGFtX05vcnRoX0Nhcm9saW5hLmh0bWwpDQpgYGB7cn0NCnVybCA8LSAiaHR0cHM6Ly93d3cudHJpcGFkdmlzb3IuY29tL0hvdGVsX1Jldmlldy1nNDkwOTItZDg0NzAxNjAtUmV2aWV3cy1UaGVfRHVyaGFtX0hvdGVsLUR1cmhhbV9Ob3J0aF9DYXJvbGluYS5odG1sIg0KDQpyZXZpZXdzIDwtIHVybCAlPiUNCiAgcmVhZF9odG1sKCkgJT4lDQogIGh0bWxfbm9kZXMoIiNSRVZJRVdTIC5pbm5lckJ1YmJsZSIpDQpgYGANCg0KLS0tDQoNCiMjIyBodG1sX2F0dHINCmBgYHtyfQ0KaWQgPC0gcmV2aWV3cyAlPiUNCiAgaHRtbF9ub2RlKCIucXVvdGUgYSIpICU+JQ0KICBodG1sX2F0dHIoImlkIikNCmBgYA0KDQotLS0NCg0KIyMjIGh0bWxfdGV4dA0KYGBge3J9DQpxdW90ZSA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5xdW90ZSBzcGFuIikgJT4lDQogIGh0bWxfdGV4dCgpDQoNCnJldmlldyA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5lbnRyeSAucGFydGlhbF9lbnRyeSIpICU+JQ0KICBodG1sX3RleHQoKQ0KYGBgDQoNCi0tLQ0KDQojIyMg
VmFyaWF0aW9ucyANCg0KUG9zdCBwcm9jZXNzaW5nDQpgYGB7cn0NCg0KIyBUaGUgcmF0aW5nIHZhcmlhYmxlIGlzIGxpc3RlZCBhcyBhbiBodHRsIGF0dHJpYnV0ZS4gIENvbnZlcnQgdG8gYSBudW1iZXIgYW5kIGRpdmlkZSBieSAxMC4NCnJhdGluZyA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5yYXRpbmcgLnVpX2J1YmJsZV9yYXRpbmciKSAlPiUNCiAgaHRtbF9hdHRycygpICU+JSANCiAgZ3N1YigidWlfYnViYmxlX3JhdGluZyBidWJibGVfIiwgIiIsIC4pICU+JQ0KICBhcy5pbnRlZ2VyKCkgLyAxMA0KDQojIENvbnZlcnQgVGltZQ0KZGF0ZSA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5yYXRpbmcgLnJhdGluZ0RhdGUiKSAlPiUNCiAgaHRtbF9hdHRyKCJ0aXRsZSIpICU+JQ0KICBzdHJwdGltZSgiJWIgJWQsICVZIikgJT4lDQogIGFzLlBPU0lYY3QoKQ0KYGBgDQoNCi0tLQ0KDQojIyBNYW5pcHVsYXRlIERhdGENCg0KUHJlbGltaW5hcnkgQW5hbHlzaXMgaW4gYSBUaWJibGUgKG1vZGVybiBEYXRhIEZyYW1lKQ0KDQojIyMgRGF0YUZyYW1lIChUaWJibGUpDQpgYGB7cn0NCmRob3RlbCA8LSBkYXRhX2ZyYW1lKGlkLCBxdW90ZSwgcmF0aW5nLCBkYXRlLCByZXZpZXcpDQpkaG90ZWwNCmBgYA0KDQotLS0NCg0KIyMjIFN1bW1hcnkgU3RhdGlzdGljcw0KYGBge3J9DQpzdW1tYXJ5KGRob3RlbCRyYXRpbmcpDQpgYGANCg0KLS0tDQoNCiMjIyBGcmVxdWVuY3kgQmFyIENoYXJ0DQpgYGB7cn0NCmRob3RlbCAlPiUgDQogIGdyb3VwX2J5KHJhdGluZykgJT4lIA0KICBtdXRhdGUoc3RhcnMgPSBmYWN0b3IocmF0aW5nLCBsZXZlbHMgPSBjKDMsIDQsIDUpLCBsYWJlbHMgPSBjKCJUaHJlZSIsICJGb3VyIiwgIkZpdmUiKSkpICU+JSANCiAgZ2d2aXMofnN0YXJzKSAlPiUgDQogIGxheWVyX2JhcnMoKQ0KYGBgDQoNCg0KLS0tDQoNCiMjIyBBcHBlbmQgVmFyaWFibGVzDQoNCkdhdGhlciBsb2NhdGlvbiBhbmQgc2NyZWVuLW5hbWUgaW5mb3JtYXRpb24uIFNlbGVjdCBvbmx5IHJldmlld3Mgd2l0aCBhIGxvY2F0aW9uIHZhbHVlDQpgYGB7cn0NCg0KIyBUaGUgbG9jYXRpb24gaW5mb3JtYXRpb24gaXMgaW4gYSBkaWZmZXJlbnQgY29sdW1uIHRoYW4gdGhlIHJldmlldyBpbmZvcm1hdGlvbi4gIEl0IGlzIGlkZW50aWZpZWQgYnkgYSBkaWZmZXJlbnQgQ1NTIGNsYXNzOiAgLmNvbDFvZjINCm1lbUluZm8gPC0gdXJsICU+JQ0KICByZWFkX2h0bWwoKSAlPiUgDQogIGh0bWxfbm9kZXMoIiNSRVZJRVdTIC5jb2wxb2YyIikNCg0KIyBJZGVudGlmeSB0aGUgTG9jYXRpb24NCmxvYzEgPC0gbWVtSW5mbyAlPiUgDQogIGh0bWxfbm9kZSgiLmxvY2F0aW9uIikgJT4lDQogIGh0bWxfdGV4dCgpDQoNCiMgSWRlbnRpZnkgdGhlIFNjcmVlbiBOYW1lDQpzY3JlZW5OYW1lIDwtIG1lbUluZm8gJT4lDQogIGh0bWxfbm9kZSgiLnNjcm5hbWUiKSAlPiUgDQogIGh0bWxfdGV4dCgpDQoNCiMgQXBwZW5kIHRoZSB2YWx1ZXMgdG8gdGhlIG9yaWdp
bmFsIGRhdGEgZnJhbWUsIGRob3RlbA0KZGhvdGVsJHNjck5hbWUgPC0gc2NyZWVuTmFtZQ0KZGhvdGVsJGxvY2F0aW9uIDwtIGxvYzENCg0KIyBTZWxlY3QgdmFyaWFibGVzIHRvIGRpc3BsYXkNCmRob3RlbCAlPiUgDQogIHNlbGVjdChzY3JOYW1lLCBsb2NhdGlvbiwgcmF0aW5nLCBxdW90ZSkgJT4lIA0KICBmaWx0ZXIoIWlzLm5hKGxvY2F0aW9uKSkNCmBgYA0KDQoNCg0K