|
| 1 | +# Ran this back in Nov 2020 to scrape data |
| 2 | + |
| 3 | +library(rvest) |
| 4 | +library(tidyverse) |
| 5 | + |
| 6 | +# pages ------------------------------------------------------------------------ |
| 7 | + |
# Download and parse the three "recently sold" search-result pages for the
# 27705 search term (result pages 1, 2 and 3 of the same map region).
page1 <-
  "https://www.zillow.com/homes/recently_sold/?searchQueryState=%7B%22usersSearchTerm%22%3A%2227705%22%2C%22mapBounds%22%3A%7B%22west%22%3A-78.97046046561346%2C%22east%22%3A-78.91123729056463%2C%22south%22%3A35.96344578386982%2C%22north%22%3A36.00664470880978%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22rs%22%3A%7B%22value%22%3Atrue%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22pmf%22%3A%7B%22value%22%3Afalse%7D%2C%22pf%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A14%2C%22customRegionId%22%3A%22b954d5fb9aX1-CRvbfiseagcose_vreiz%22%2C%22pagination%22%3A%7B%7D%7D" %>%
  read_html()
page2 <-
  "https://www.zillow.com/homes/recently_sold/2_p/?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%2227705%22%2C%22mapBounds%22%3A%7B%22west%22%3A-78.97046046561346%2C%22east%22%3A-78.91123729056463%2C%22south%22%3A35.96344578386982%2C%22north%22%3A36.00664470880978%7D%2C%22mapZoom%22%3A14%2C%22customRegionId%22%3A%22b954d5fb9aX1-CRvbfiseagcose_vreiz%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22pmf%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22rs%22%3A%7B%22value%22%3Atrue%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22pf%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D" %>%
  read_html()
page3 <-
  "https://www.zillow.com/homes/recently_sold/3_p/?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A3%7D%2C%22usersSearchTerm%22%3A%2227705%22%2C%22mapBounds%22%3A%7B%22west%22%3A-78.97046046561346%2C%22east%22%3A-78.91123729056463%2C%22south%22%3A35.96344578386982%2C%22north%22%3A36.00664470880978%7D%2C%22mapZoom%22%3A14%2C%22customRegionId%22%3A%22b954d5fb9aX1-CRvbfiseagcose_vreiz%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22pmf%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22rs%22%3A%7B%22value%22%3Atrue%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22pf%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D" %>%
  read_html()
| 11 | + |
| 12 | +# function: read_zillow -------------------------------------------------------- |
| 13 | + |
# Extract the listing cards from one parsed search-results page.
#
# page: a parsed HTML document, as returned by read_html().
#
# Returns a tibble with one row per listing card and four character columns:
# address, price, details (the raw text of the card's details element) and
# link (the card's href). A page with no cards yields a zero-row tibble.
# Stops with an error if the four scraped vectors do not line up.
read_zillow <- function(page) {

  # text of every node matching one CSS selector
  card_text <- function(css) {
    page %>%
      html_nodes(css) %>%
      html_text()
  }

  addresses <- card_text(".list-card-addr")
  prices <- card_text(".list-card-price")
  details <- card_text(".list-card-details")

  links_double <- page %>%
    html_nodes(".list-card-link") %>%
    html_attr("href")

  # Keep the hrefs at odd positions (1, 3, 5, ...); the selector presumably
  # matches two anchors per card. Logical recycling is used instead of
  # seq(1, length(x), 2) because the latter errors when x is empty.
  links <- links_double[c(TRUE, FALSE)]

  # tibble() would silently recycle a length-1 column, pairing one value with
  # every listing, so require the four vectors to have the same length.
  n_found <- c(
    address = length(addresses),
    price = length(prices),
    details = length(details),
    link = length(links)
  )
  if (length(unique(n_found)) != 1) {
    stop(
      "Listing fields have different lengths: ",
      paste0(names(n_found), " = ", n_found, collapse = ", "),
      call. = FALSE
    )
  }

  tibble(
    address = addresses,
    price = prices,
    details = details,
    link = links
  )

}
| 42 | + |
| 43 | +# all houses ------------------------------------------------------------------- |
| 44 | + |
# Parse each downloaded results page into its own tibble of listings ...
df1 <- page1 %>% read_zillow()
df2 <- page2 %>% read_zillow()
df3 <- page3 %>% read_zillow()

# ... and stack them, page 1 first, into a single raw table.
df_raw <- list(df1, df2, df3) %>%
  bind_rows()
| 50 | + |
| 51 | +# clean data ------------------------------------------------------------------- |
| 52 | + |
# Convert the scraped text columns to numbers and split `details` into
# bed / bath / area. The result overwrites df_raw.
df_raw <- df_raw %>%
  mutate(
    price = parse_number(price),
    # A parsed price below 100 is scaled by one million
    # (presumably the card showed it abbreviated, e.g. "$1.2M" -- confirm).
    price = if_else(price < 100, price * 1000000, price)
  ) %>%
  # Drop rows whose details mention "lot" or "Studio", or are all placeholders.
  filter(
    !str_detect(details, "lot"),
    !str_detect(details, "-- bds-- ba-- sqft"),
    !str_detect(details, "Studio")
  ) %>%
  mutate(
    # Insert " - " after the bed and bath labels so the string can be split.
    # "bds?" also matches a "bd" label; with only "bds" such a row would get a
    # single separator and its area would land in the bath column.
    details = str_replace(details, "(bds?)", "\\1 - "),
    details = str_replace(details, "ba", "ba - ")
  ) %>%
  separate(details, into = c("bed", "bath", "area"), sep = " - ") %>%
  mutate(
    bed = parse_number(bed),
    bath = parse_number(bath),
    area = parse_number(area)
  )
| 73 | + |
| 74 | +# function: get_features ------------------------------------------------------- |
| 75 | + |
# Scrape the "facts and features" list from a single listing page.
#
# url: address of the listing page to download.
#
# Returns a named character vector with one element per list item: the text
# before the first ":" becomes the name and everything after it the value
# (NA when the item contains no ":"). Returns NULL when the page has no
# matching list items. Errors raised by read_html() are not caught.
get_features <- function(url) {

  page <- read_html(url)

  # Random pause between requests: Poisson-distributed, mean 1 second
  # (can be 0).
  Sys.sleep(rpois(1, lambda = 1))

  fact_text <- page %>%
    html_nodes(".ds-home-facts-and-features") %>%
    html_nodes(".ds-home-fact-list>li") %>%
    html_text()

  fact_text %>%
    strsplit(":") %>%
    map(function(parts) {
      # Re-join the pieces after the first ":" so a value that itself contains
      # a colon is kept whole instead of being cut at its first colon.
      value <- if (length(parts) > 1) {
        paste(parts[-1], collapse = ":")
      } else {
        NA_character_
      }
      setNames(value, parts[1])
    }) %>%
    unlist()

}
| 93 | + |
| 94 | +# get features ----------------------------------------------------------------- |
| 95 | + |
# Download the fact list of every listing (one request per row) and keep it
# in a list-column called `features`.
df_with_features <- mutate(df_raw, features = map(link, get_features))

# Spread each named fact into its own column, then standardise the column
# names (the `lot` and `type` columns used below come from this step).
df <- janitor::clean_names(unnest_wider(df_with_features, features))

# Make `lot` numeric; values above 100 are divided by 43560 and rounded to two
# decimals (presumably square feet -> acres -- confirm). Only rows whose type
# is exactly "Single Family" are kept.
df <- df %>%
  mutate(lot = parse_number(lot)) %>%
  mutate(lot = if_else(lot > 100, round(lot / 43560, 2), lot)) %>%
  filter(type == "Single Family")

# write csv --------------------------------------------------------------------

write_csv(df, "duke-forest.csv")
0 commit comments