Load packages
library(rvest)
library(ggvis)
library(tidyverse)
Demonstration
Use rvest to scrape tripadvisor.com
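Inspired by http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html and `demo(tripadvisor, package = "rvest")`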
Star Wars
Goal: Scrape the first table on the Carrie Fisher Wikipedia page
# Read the Carrie Fisher Wikipedia page, select the first element matching
# the ".wikitable" CSS selector, and parse it with html_table().
# The parsed table is not assigned, so it prints when run at top level.
CarrieFisherWiki <- "https://en.wikipedia.org/wiki/Carrie_Fisher"
html_table(
  html_node(read_html(CarrieFisherWiki), ".wikitable")
)
The Durham Hotel
html_nodes
Read the HTML for The Durham Hotel review on TripAdvisor
# Download The Durham Hotel page and keep every node that matches
# "#REVIEWS .innerBubble"; the chunks below query these nodes.
url <- "https://www.tripadvisor.com/Hotel_Review-g49092-d8470160-Reviews-The_Durham_Hotel-Durham_North_Carolina.html"
reviews <- html_nodes(read_html(url), "#REVIEWS .innerBubble")
html_attr
# For each review node, take its ".quote a" element and read the value of
# that element's "id" attribute.
id <- html_attr(html_node(reviews, ".quote a"), "id")
html_text
# Text of the ".quote span" element of each review node
quote <- html_text(html_node(reviews, ".quote span"))

# Text of the ".entry .partial_entry" element of each review node
review <- html_text(html_node(reviews, ".entry .partial_entry"))
Variations
Post processing
# The rating is stored in an HTML attribute: the class of the rating node has
# the form "ui_bubble_rating bubble_<NN>". Read only the class attribute.
# html_attrs() returns every attribute of each node, so any attribute besides
# class would end up in the string passed to gsub() and as.integer() would
# then yield NA.
rating_code <- reviews %>%
  html_node(".rating .ui_bubble_rating") %>%
  html_attr("class") %>%
  gsub("ui_bubble_rating bubble_", "", .) %>%
  as.integer()

# Convert the integer code to a number by dividing by 10.
rating <- rating_code / 10

# Convert Time
# The review date is read from the "title" attribute and parsed with the
# format "%b %d, %Y". %b matches the abbreviated month name of the current
# locale. strptime() returns POSIXlt, so the result is converted to POSIXct
# before it is stored as a data frame column.
date <- reviews %>%
  html_node(".rating .ratingDate") %>%
  html_attr("title") %>%
  strptime("%b %d, %Y") %>%
  as.POSIXct()
Manipulate Data
Preliminary Analysis in a Tibble (modern Data Frame)
DataFrame (Tibble)
# Combine the scraped vectors into a tibble, one column per vector; column
# names are taken from the variable names.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
dhotel <- tibble(id, quote, rating, date, review)

# Print the tibble
dhotel
Summary Statistics
# Summary statistics of the rating column
summary(dhotel[["rating"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.00 3.25 4.00 4.10 5.00 5.00
Frequency Bar Chart
# Bar chart of the reviews by star rating.
# NOTE: only the ratings 3, 4 and 5 are declared as factor levels, so any
# other rating value becomes NA in `stars`.
layer_bars(
  ggvis(
    mutate(
      group_by(dhotel, rating),
      stars = factor(
        rating,
        levels = c(3, 4, 5),
        labels = c("Three", "Four", "Five")
      )
    ),
    ~stars
  )
)
Append Variables
Gather location and screen-name information, then keep only the reviews that have a location value.
# Location details are in a different column than the review text and are
# selected with a different CSS class, .col1of2. The page is read again here
# and the matching nodes are collected.
memInfo <- html_nodes(read_html(url), "#REVIEWS .col1of2")

# Location: text of the ".location" element of each member-info node
loc1 <- html_text(html_node(memInfo, ".location"))

# Screen name: text of the ".scrname" element of each member-info node
screenName <- html_text(html_node(memInfo, ".scrname"))
# Add the screen name and location as new columns of dhotel
dhotel[["scrName"]] <- screenName
dhotel[["location"]] <- loc1

# Display four columns, keeping only the rows whose location is not NA
filter(
  select(dhotel, scrName, location, rating, quote),
  !is.na(location)
)
LS0tDQp0aXRsZTogInJ2ZXN0IGRlbW8iDQphdXRob3I6ICJKb2huIExpdHRsZSINCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCINCm91dHB1dDogDQogIGh0bWxfbm90ZWJvb2s6DQogICAgdG9jOiB5ZXMNCiAgICB0b2NfZmxvYXQ6IHllcw0KLS0tDQoNCiMjIExvYWQgcGFja2FnZXMNCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KbGlicmFyeShydmVzdCkNCmxpYnJhcnkoZ2d2aXMpDQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmBgYA0KDQoNCiMjIERlbW9uc3RyYXRpb24NCg0KVXNlIHJ2ZXN0IHRvIHNjcmFwZSB0cmlwYWR2aXNvci5jb20NCg0KSW5zcGlyZWQgYnkNCg0KLSBodHRwOi8vbm90ZXNvZmRhYmJsZXIuZ2l0aHViLmlvLzIwMTQwOF9ob3RlbFJldmlldy9zY3JhcGVUcmlwQWR2aXNvci5odG1sDQotIGBkZW1vKHRyaXBhZHZpc29yLCBwYWNrYWdlID0gInJ2ZXN0IilgDQoNCi0tLQ0KDQojIyBTdGFyIFdhcnMNCg0KR29hbDogIFNjcmFwZSB0aGUgZmlyc3QgdGFibGUgb24gdGhlIFtDYXJyaWUgRmlzaGVyIFdpa2lwZWRpYSBwYWdlXShodHRwczovL2VuLndpa2lwZWRpYS5vcmcvd2lraS9DYXJyaWVfRmlzaGVyKQ0KDQpgYGB7cn0NCkNhcnJpZUZpc2hlcldpa2kgPC0gImh0dHBzOi8vZW4ud2lraXBlZGlhLm9yZy93aWtpL0NhcnJpZV9GaXNoZXIiDQoNCkNhcnJpZUZpc2hlcldpa2kgJT4lIA0KICByZWFkX2h0bWwoKSAlPiUgDQogIGh0bWxfbm9kZSgiLndpa2l0YWJsZSIpICU+JSANCiAgaHRtbF90YWJsZSgpDQoNCmBgYA0KDQojIyBUaGUgRHVyaGFtIEhvdGVsDQoNCiMjIyBodG1sX25vZGVzDQoNClJlYWQgdGhlIEhUTUwgZm9yIFtfVGhlIER1cmhhbSBIb3RlbF8gcmV2aWV3IG9uIFRyaXBBZHZpc29yXShodHRwczovL3d3dy50cmlwYWR2aXNvci5jb20vSG90ZWxfUmV2aWV3LWc0OTA5Mi1kODQ3MDE2MC1SZXZpZXdzLVRoZV9EdXJoYW1fSG90ZWwtRHVyaGFtX05vcnRoX0Nhcm9saW5hLmh0bWwpDQpgYGB7cn0NCnVybCA8LSAiaHR0cHM6Ly93d3cudHJpcGFkdmlzb3IuY29tL0hvdGVsX1Jldmlldy1nNDkwOTItZDg0NzAxNjAtUmV2aWV3cy1UaGVfRHVyaGFtX0hvdGVsLUR1cmhhbV9Ob3J0aF9DYXJvbGluYS5odG1sIg0KDQpyZXZpZXdzIDwtIHVybCAlPiUNCiAgcmVhZF9odG1sKCkgJT4lDQogIGh0bWxfbm9kZXMoIiNSRVZJRVdTIC5pbm5lckJ1YmJsZSIpDQpgYGANCg0KLS0tDQoNCiMjIyBodG1sX2F0dHINCmBgYHtyfQ0KaWQgPC0gcmV2aWV3cyAlPiUNCiAgaHRtbF9ub2RlKCIucXVvdGUgYSIpICU+JQ0KICBodG1sX2F0dHIoImlkIikNCmBgYA0KDQotLS0NCg0KIyMjIGh0bWxfdGV4dA0KYGBge3J9DQpxdW90ZSA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5xdW90ZSBzcGFuIikgJT4lDQogIGh0bWxfdGV4dCgpDQoNCnJldmlldyA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5lbnRyeSAucGFydGlhbF9lbnRyeSIpICU+JQ0KICBodG1sX3RleHQoKQ0KYGBgDQoNCi0tLQ0KDQojIyMg
VmFyaWF0aW9ucyANCg0KUG9zdCBwcm9jZXNzaW5nDQpgYGB7cn0NCg0KIyBUaGUgcmF0aW5nIHZhcmlhYmxlIGlzIGxpc3RlZCBhcyBhbiBodHRsIGF0dHJpYnV0ZS4gIENvbnZlcnQgdG8gYSBudW1iZXIgYW5kIGRpdmlkZSBieSAxMC4NCnJhdGluZyA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5yYXRpbmcgLnVpX2J1YmJsZV9yYXRpbmciKSAlPiUNCiAgaHRtbF9hdHRycygpICU+JSANCiAgZ3N1YigidWlfYnViYmxlX3JhdGluZyBidWJibGVfIiwgIiIsIC4pICU+JQ0KICBhcy5pbnRlZ2VyKCkgLyAxMA0KDQojIENvbnZlcnQgVGltZQ0KZGF0ZSA8LSByZXZpZXdzICU+JQ0KICBodG1sX25vZGUoIi5yYXRpbmcgLnJhdGluZ0RhdGUiKSAlPiUNCiAgaHRtbF9hdHRyKCJ0aXRsZSIpICU+JQ0KICBzdHJwdGltZSgiJWIgJWQsICVZIikgJT4lDQogIGFzLlBPU0lYY3QoKQ0KYGBgDQoNCi0tLQ0KDQojIyBNYW5pcHVsYXRlIERhdGENCg0KUHJlbGltaW5hcnkgQW5hbHlzaXMgaW4gYSBUaWJibGUgKG1vZGVybiBEYXRhIEZyYW1lKQ0KDQojIyMgRGF0YUZyYW1lIChUaWJibGUpDQpgYGB7cn0NCmRob3RlbCA8LSBkYXRhX2ZyYW1lKGlkLCBxdW90ZSwgcmF0aW5nLCBkYXRlLCByZXZpZXcpDQpkaG90ZWwNCmBgYA0KDQotLS0NCg0KIyMjIFN1bW1hcnkgU3RhdGlzdGljcw0KYGBge3J9DQpzdW1tYXJ5KGRob3RlbCRyYXRpbmcpDQpgYGANCg0KLS0tDQoNCiMjIyBGcmVxdWVuY3kgQmFyIENoYXJ0DQpgYGB7cn0NCmRob3RlbCAlPiUgDQogIGdyb3VwX2J5KHJhdGluZykgJT4lIA0KICBtdXRhdGUoc3RhcnMgPSBmYWN0b3IocmF0aW5nLCBsZXZlbHMgPSBjKDMsIDQsIDUpLCBsYWJlbHMgPSBjKCJUaHJlZSIsICJGb3VyIiwgIkZpdmUiKSkpICU+JSANCiAgZ2d2aXMofnN0YXJzKSAlPiUgDQogIGxheWVyX2JhcnMoKQ0KYGBgDQoNCg0KLS0tDQoNCiMjIyBBcHBlbmQgVmFyaWFibGVzDQoNCkdhdGhlciBsb2NhdGlvbiBhbmQgc2NyZWVuLW5hbWUgaW5mb3JtYXRpb24uIFNlbGVjdCBvbmx5IHJldmlld3Mgd2l0aCBhIGxvY2F0aW9uIHZhbHVlDQpgYGB7cn0NCg0KIyBUaGUgbG9jYXRpb24gaW5mb3JtYXRpb24gaXMgaW4gYSBkaWZmZXJlbnQgY29sdW1uIHRoYW4gdGhlIHJldmlldyBpbmZvcm1hdGlvbi4gIEl0IGlzIGlkZW50aWZpZWQgYnkgYSBkaWZmZXJlbnQgQ1NTIGNsYXNzOiAgLmNvbDFvZjINCm1lbUluZm8gPC0gdXJsICU+JQ0KICByZWFkX2h0bWwoKSAlPiUgDQogIGh0bWxfbm9kZXMoIiNSRVZJRVdTIC5jb2wxb2YyIikNCg0KIyBJZGVudGlmeSB0aGUgTG9jYXRpb24NCmxvYzEgPC0gbWVtSW5mbyAlPiUgDQogIGh0bWxfbm9kZSgiLmxvY2F0aW9uIikgJT4lDQogIGh0bWxfdGV4dCgpDQoNCiMgSWRlbnRpZnkgdGhlIFNjcmVlbiBOYW1lDQpzY3JlZW5OYW1lIDwtIG1lbUluZm8gJT4lDQogIGh0bWxfbm9kZSgiLnNjcm5hbWUiKSAlPiUgDQogIGh0bWxfdGV4dCgpDQoNCiMgQXBwZW5kIHRoZSB2YWx1ZXMgdG8gdGhlIG9yaWdp
bmFsIGRhdGEgZnJhbWUsIGRob3RlbA0KZGhvdGVsJHNjck5hbWUgPC0gc2NyZWVuTmFtZQ0KZGhvdGVsJGxvY2F0aW9uIDwtIGxvYzENCg0KIyBTZWxlY3QgdmFyaWFibGVzIHRvIGRpc3BsYXkNCmRob3RlbCAlPiUgDQogIHNlbGVjdChzY3JOYW1lLCBsb2NhdGlvbiwgcmF0aW5nLCBxdW90ZSkgJT4lIA0KICBmaWx0ZXIoIWlzLm5hKGxvY2F0aW9uKSkNCmBgYA0KDQoNCg0K