Skip to content

Commit 92ffe74

Browse files
Add scraping code for provenance + sale price, not listing
1 parent 9e86a2b commit 92ffe74

File tree

3 files changed

+122
-10
lines changed

3 files changed

+122
-10
lines changed

R/data-duke_forest.R

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
#' Houses for sale in Duke Forest, Durham, NC in Nov 2020
1+
#' Sale prices of houses in Duke Forest, Durham, NC
22
#'
3-
#' Data on houses that were for sale in the Duke Forest neighborhood of
3+
#' Data on houses that were recently sold in the Duke Forest neighborhood of
44
#' Durham, NC in November 2020.
55
#'
66
#' @format A data frame with 98 rows and 13 variables.
77
#' \describe{
88
#' \item{address}{Address of house.}
9-
#' \item{price}{Listing price, in USD.}
9+
#' \item{price}{Sale price, in USD.}
1010
#' \item{bed}{Number of bedrooms.}
1111
#' \item{bath}{Number of bathrooms.}
1212
#' \item{area}{Area of home, in square feet.}
@@ -28,7 +28,7 @@
2828
#' geom_boxplot() +
2929
#' labs(
3030
#' x = "Number of bedrooms",
31-
#' y = "Listing price (USD)",
31+
#' y = "Sale price (USD)",
3232
#' title = "Homes sold in Duke Forest, Durham, NC",
3333
#' subtitle = "Data are from November 2020"
3434
#' )
@@ -38,7 +38,7 @@
3838
#' geom_point() +
3939
#' labs(
4040
#' x = "Area (square feet)",
41-
#' y = "Listing price (USD)",
41+
#' y = "Sale price (USD)",
4242
#' title = "Homes sold in Duke Forest, Durham, NC",
4343
#' subtitle = "Data are from November 2020"
4444
#' )

data-raw/duke_forest/zillow.R

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# Ran this back in Nov 2020 to scrape data
2+
3+
library(rvest)
4+
library(tidyverse)
5+
6+
# pages ------------------------------------------------------------------------
7+
8+
# Download and parse three pages of a Zillow "recently_sold" search. All three
# URLs carry the same search term (27705), map bounds and custom region id;
# pages 2 and 3 differ by the `2_p/` / `3_p/` path segment and the matching
# `currentPage` value inside the encoded `searchQueryState` parameter.
page1 <- read_html("https://www.zillow.com/homes/recently_sold/?searchQueryState=%7B%22usersSearchTerm%22%3A%2227705%22%2C%22mapBounds%22%3A%7B%22west%22%3A-78.97046046561346%2C%22east%22%3A-78.91123729056463%2C%22south%22%3A35.96344578386982%2C%22north%22%3A36.00664470880978%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22rs%22%3A%7B%22value%22%3Atrue%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22pmf%22%3A%7B%22value%22%3Afalse%7D%2C%22pf%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A14%2C%22customRegionId%22%3A%22b954d5fb9aX1-CRvbfiseagcose_vreiz%22%2C%22pagination%22%3A%7B%7D%7D")
9+
page2 <- read_html("https://www.zillow.com/homes/recently_sold/2_p/?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A2%7D%2C%22usersSearchTerm%22%3A%2227705%22%2C%22mapBounds%22%3A%7B%22west%22%3A-78.97046046561346%2C%22east%22%3A-78.91123729056463%2C%22south%22%3A35.96344578386982%2C%22north%22%3A36.00664470880978%7D%2C%22mapZoom%22%3A14%2C%22customRegionId%22%3A%22b954d5fb9aX1-CRvbfiseagcose_vreiz%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22pmf%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22rs%22%3A%7B%22value%22%3Atrue%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22pf%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D")
10+
page3 <- read_html("https://www.zillow.com/homes/recently_sold/3_p/?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A3%7D%2C%22usersSearchTerm%22%3A%2227705%22%2C%22mapBounds%22%3A%7B%22west%22%3A-78.97046046561346%2C%22east%22%3A-78.91123729056463%2C%22south%22%3A35.96344578386982%2C%22north%22%3A36.00664470880978%7D%2C%22mapZoom%22%3A14%2C%22customRegionId%22%3A%22b954d5fb9aX1-CRvbfiseagcose_vreiz%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22pmf%22%3A%7B%22value%22%3Afalse%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22rs%22%3A%7B%22value%22%3Atrue%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22pf%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D")
11+
12+
# function: read_zillow --------------------------------------------------------
13+
14+
# Extract the listing cards from one parsed Zillow search-results page.
#
# page: a parsed HTML document, e.g. the result of `read_html()`.
#
# Returns a tibble with character columns `address`, `price`, `details` and
# `link`, one row per card. A page without any cards yields a zero-row tibble.
read_zillow <- function(page) {

  addresses <- page %>%
    html_nodes(".list-card-addr") %>%
    html_text()

  prices <- page %>%
    html_nodes(".list-card-price") %>%
    html_text()

  details <- page %>%
    html_nodes(".list-card-details") %>%
    html_text()

  links_double <- page %>%
    html_nodes(".list-card-link") %>%
    html_attr("href")

  # Keep the links at odd positions (presumably each card carries two anchors
  # with the same href -- confirm against the page markup). Logical recycling
  # selects the same elements as seq(1, length(x), 2) but, unlike seq(), does
  # not error when no links were found.
  links <- links_double[c(TRUE, FALSE)]

  tibble(
    address = addresses,
    price   = prices,
    details = details,
    link    = links
  )

}
42+
43+
# all houses -------------------------------------------------------------------
44+
45+
# Parse each results page into its own tibble, then stack them row-wise.
df1 <- page1 %>% read_zillow()
df2 <- page2 %>% read_zillow()
df3 <- page3 %>% read_zillow()

df_raw <- list(df1, df2, df3) %>%
  bind_rows()
50+
51+
# clean data -------------------------------------------------------------------
52+
53+
# Turn the scraped text columns into numbers: parse the price, drop cards that
# are not regular homes, and split `details` into bed / bath / area.
df_raw <- df_raw %>%
  mutate(
    price = parse_number(price),
    # Parsed values below 100 are scaled to millions (presumably prices that
    # Zillow abbreviates, e.g. "1.2M" -- confirm against the raw cards).
    price = if_else(price < 100, price * 1000000, price)
  ) %>%
  filter(
    !str_detect(details, "lot"),
    !str_detect(details, "-- bds-- ba-- sqft"),
    !str_detect(details, "Studio")
  ) %>%
  mutate(
    # Insert " - " after the bed and bath labels so the string can be split.
    # `bds?` also matches a singular "bd" label; with the plain "bds" pattern
    # such a row would produce only two pieces and bath / area would end up in
    # the wrong columns.
    details = str_replace(details, "bds?", "bds - "),
    details = str_replace(details, "ba", "ba - ")
  ) %>%
  separate(details, into = c("bed", "bath", "area"), sep = " - ") %>%
  mutate(
    bed  = parse_number(bed),
    bath = parse_number(bath),
    area = parse_number(area)
  )
73+
74+
# function: get_features -------------------------------------------------------
75+
76+
# Scrape the facts-and-features list from a single listing page.
#
# url: address of the listing page to download.
#
# Returns a named character vector with one element per list item: the name
# is the text before the first ":" and the value is everything after it
# (NA when the item has no ":"). Returns NULL when no list items are found.
get_features <- function(url) {

  page <- read_html(url)

  # Random pause between requests (Poisson-distributed, mean of one second).
  Sys.sleep(rpois(1, lambda = 1))

  page %>%
    html_nodes(".ds-home-facts-and-features") %>%
    html_nodes(".ds-home-fact-list>li") %>%
    html_text() %>%
    # Split at the first colon only: splitting on every colon and keeping the
    # second piece would truncate values that themselves contain ":".
    str_split(":", n = 2) %>%
    map(
      ~ setNames(.x[2], .x[1])
    ) %>%
    unlist()

}
93+
94+
# get features -----------------------------------------------------------------
95+
96+
# Fetch the feature vector for every listing link (one request per row).
df_with_features <- mutate(df_raw, features = map(link, get_features))

# Spread the named feature vectors into columns and normalise column names.
df <- janitor::clean_names(unnest_wider(df_with_features, features))

# Parse the lot size; values above 100 are divided by 43560 (square feet per
# acre) and rounded to two decimals. Then keep single-family homes only.
df <- df %>%
  mutate(lot = parse_number(lot)) %>%
  mutate(lot = if_else(lot > 100, round(lot / 43560, 2), lot)) %>%
  filter(type == "Single Family")
109+
110+
# write csv --------------------------------------------------------------------
111+
112+
# Save the cleaned data to the working directory.
df %>%
  write_csv("duke-forest.csv")

man/duke_forest.Rd

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)