Word - TerpConnect

advertisement
Data Collection Code
1.
Zillow Scraper
Part 1:
# Zoomed in on each area, then used that URL to scrape data
# Zillow shows 25 houses per page; the number of pages depends on the Seattle area you select.
# Usually there are about 25 pages; however, the URL gives valid results up to i = 40, so we used 40 to get as much data as possible and then removed duplicates.
# Area1 Sold -- We repeated this for 12 different areas to get houses from every section of Seattle
# install.packages("XML")
# install.packages("RCurl")
# import library/ defining parameters
# Load the HTML-parsing and HTTP dependencies for the scraper.
library(XML)
library(RCurl)

# Accumulators for the scraped attributes; each starts empty and grows
# as listing pages are processed.
streetdata          <- vector()
pricedata           <- vector()
zestdata            <- vector()
addressLocalitydata <- vector()
postalCodedata      <- vector()
addressRegiondata   <- vector()
propertydetailsdata <- vector()
latitudedata        <- vector()
longitudedata       <- vector()
homeTypedata        <- vector()
lotdata             <- vector()
yeardata            <- vector()
price_sqftdata      <- vector()
propertyaddressdata <- vector()

# Number of result pages to scrape for this area.
x <- 40
# Iterate over the result pages, build each page's URL, and scrape the
# 25 house listings it contains.

# Base URL for this section of Seattle (area 1).
# NOTE(review): the original page-2+ URL was missing the minus sign on
# the western longitude (122.396536 instead of -122.396536), so pages
# after the first queried a different bounding box than page 1; fixed.
baseurl <- "http://www.zillow.com/homes/recently_sold/Seattle-WA/16037_rid/47.780924,-122.268648,47.716634,-122.396536_rect/12_zm/"

# Extract one attribute from the per-page listing nodes.
# nodes : list of <article> nodes, one per house on the page.
# xpath : relative XPath of the attribute inside a listing.
# ...   : extra arguments forwarded to xpathSApply (e.g. xmlValue);
#         omitted for attribute paths such as ".../@content".
# Listings with no value (privacy-restricted, sign-in required) come
# back as an empty list and are replaced by NA.  Only the first 25
# entries are kept because Zillow randomly duplicates listings within a
# page (each page has at most 25 houses).
scrape_field <- function(nodes, xpath, ...) {
  vals <- lapply(nodes, xpathSApply, xpath, ...)
  vals[sapply(vals, is.list)] <- NA
  unlist(vals)[1:25]
}

for (i in seq_len(x)) {
  # Page 1 uses the bare URL; later pages append "<i>_p/".
  url <- if (i == 1) baseurl else paste0(baseurl, i, "_p/")

  # Parse the page and grab the <article> containers, one per house;
  # working per-container keeps the attributes of one house aligned.
  html <- htmlTreeParse(url, useInternalNodes = TRUE)
  main <- getNodeSet(html, '//article[@role="article"]')

  # Street address
  streetdata <- c(streetdata, scrape_field(main, './/span[@itemprop="streetAddress"]', xmlValue))
  # Property address
  propertyaddressdata <- c(propertyaddressdata, scrape_field(main, './/dt[@class="propertyaddress"]', xmlValue))
  # Sold price
  pricedata <- c(pricedata, scrape_field(main, './/dl[@class="property-info-list col-1 column"]/dt[@class="type-recentlySold type show-icon"]', xmlValue))
  # Zillow price estimate (not used as a model attribute; kept to
  # compare Zillow's estimate against the actual sale price)
  zestdata <- c(zestdata, scrape_field(main, './/div[@class="zestimate"]', xmlValue))
  # Sold date (stored under the "House_Type" column name downstream)
  homeTypedata <- c(homeTypedata, scrape_field(main, './/dt[@class="sold-date"]', xmlValue))
  # Locality
  addressLocalitydata <- c(addressLocalitydata, scrape_field(main, './/span[@itemprop="addressLocality"]', xmlValue))
  # Region
  addressRegiondata <- c(addressRegiondata, scrape_field(main, './/span[@itemprop="addressRegion"]', xmlValue))
  # Zip code
  postalCodedata <- c(postalCodedata, scrape_field(main, './/span[@itemprop="postalCode"]', xmlValue))
  # Latitude / longitude come from meta-tag attributes, not element text,
  # so no xmlValue extractor is passed.
  latitudedata <- c(latitudedata, scrape_field(main, './/meta[@itemprop="latitude"]/@content'))
  longitudedata <- c(longitudedata, scrape_field(main, './/meta[@itemprop="longitude"]/@content'))
  # Bed/bath/sqft blob -- no finer-grained tag; split during cleaning
  propertydetailsdata <- c(propertydetailsdata, scrape_field(main, './/dt[@class="propertydata"]', xmlValue))
  # Lot area
  lotdata <- c(lotdata, scrape_field(main, './/dt[@class="property-lot"]', xmlValue))
  # Year the house was built
  yeardata <- c(yeardata, scrape_field(main, './/dt[@class="property-year"]', xmlValue))
  # Price per square foot
  price_sqftdata <- c(price_sqftdata, scrape_field(main, './/dd[@class="price-sqftb"]', xmlValue))

  # Drop per-page objects so a failed iteration cannot silently reuse
  # stale values from the previous page.
  rm(url, main, html)
}
# Assemble the scraped vectors into one data frame, one row per house.
# NOTE(review): the original assignment used "<" instead of "<-"
# (extraction garble), and the street column header had the typo
# "StreeAddress"; both fixed.
Area1Sold <- data.frame(
  "Property_Address" = propertyaddressdata,
  "StreetAddress"    = streetdata,
  "Price"            = pricedata,
  "Zillow_Estimate"  = zestdata,
  "House_Type"       = homeTypedata,
  "Locality"         = addressLocalitydata,
  "Region"           = addressRegiondata,
  "Postal"           = postalCodedata,
  "Latitude"         = latitudedata,
  "Longitude"        = longitudedata,
  "PropertyDetails"  = propertydetailsdata,
  "Lot_Area"         = lotdata,
  "Built_Year"       = yeardata,
  "Price_Sqft"       = price_sqftdata
)
# Tag every row with the area it came from so the 12 area files can be
# told apart after merging.
Area1Sold$Record <- rep("Area1 Sold", nrow(Area1Sold))
# Check data
View(Area1Sold)
nrow(Area1Sold)
# Zillow repeats listings across pages; drop exact duplicate rows.
Area1Sold <- Area1Sold[!duplicated(Area1Sold), ]
# Save as CSV file
write.csv(Area1Sold, "Area1Sold.csv")
# rm(list=ls())
Part 2:
# The 12 scraped areas overlap, so the same house can appear in more
# than one Area data frame; stack them all and de-duplicate.
combine <- rbind(Area1Sold, Area2Sold, Area3Sold, Area4Sold, Area5Sold, Area6Sold,
                 Area7Sold, Area8Sold, Area9Sold, Area10Sold, Area11Sold, Area12Sold)
View(combine)
# Drop the bookkeeping "Record" column (the area tag added while
# scraping) and the row-name column R adds on CSV round-trips.
# NOTE(review): for in-memory frames combine$row.names is NULL, so that
# line is a no-op; a column added by read.csv would be named "X", not
# "row.names" -- confirm which path feeds this script.
combine$Record <- NULL
combine$row.names <- NULL
View(combine)
# Remove duplicate houses caused by the overlapping areas.
HouseSoldData <- combine[!duplicated(combine), ]
View(HouseSoldData)
# Save as CSV
write.csv(HouseSoldData, "HouseSoldData.csv")
# rm(list=ls())
2. Get School Data
# Get all schools within a 5-mile radius of each house.  Seattle is a
# small city, so 5 miles is generous, but it guarantees no school is
# missed.
# install.packages("rjson")
# install.packages("plyr")   # needed for rbind.fill
library(plyr)
library(rjson)

# education.com API key (tied to the requesting IP address).
eduAPIkey <- "e94e2c906c025ba5fc26fd12526ae602"
# eduAPIkey <- "8455c78313d430ee187149eefa5a43f7"

# Search radius in miles.
radius <- 5

# Accumulators.
SchoolFinal <- vector()
SchoolData <- vector()

# House data produced by the Zillow scrape.
Sold <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/(HouseSoldData)FULL_R.csv")
# Number of houses to query for.
n <- nrow(Sold)

# Raw JSON results, one batch per house.
f <- vector()

# Query the API once per house for every school within `radius` miles
# of that house's coordinates.
for (i in seq_len(n)) {
  resURL <- paste("http://api.education.com/service/service.php?f=schoolSearch&key=",
                  eduAPIkey,
                  "&sn=sf&v=4&latitude=", Sold$Latitude[i],
                  "&longitude=", Sold$Longitude[i],
                  "&distance=", radius, "&resf=json",
                  sep = "")
  # Parse JSON and accumulate the school records.
  json <- fromJSON(file = resURL, method = "C")
  f <- c(f, json)
}

# Attribute names as they appear in the JSON strings.
# NOTE(review): "districleaid" looks like a typo for the district LEA id
# field -- confirm against an actual API response before relying on it.
fieldList <- c("nces_id", "schoolname", "districleaid",
               "zip", "city", "state", "latitude",
               "longitude", "schooltype", "testrating_text", "gradelevel",
               "studentteacherratio")

# Flatten each school record into a one-row data frame and stack them.
temp <- sapply(f, function(rec) unlist(rec$school[fieldList]))
SchoolData <- rbind.fill(lapply(temp, function(y)
  as.data.frame(t(y), stringsAsFactors = FALSE)))
SchoolData$latitude <- as.numeric(SchoolData$latitude)
SchoolData$longitude <- as.numeric(SchoolData$longitude)
# Extract the numeric test rating from its text form.
# NOTE(review): the original pattern "[^09]" stripped everything except
# the digits 0 and 9; corrected to "[^0-9]" to strip all non-digits.
SchoolData$testrating <- as.numeric(gsub("[^0-9]", "", SchoolData$testrating_text))
View(SchoolData)
# Remove duplicates (nearby houses share the same schools).
SchoolData <- SchoolData[!duplicated(SchoolData), ]
View(SchoolData)
# Save as CSV
write.csv(SchoolData, "SchoolData.csv")
# rm(list=ls())
3. Add Number of Schools and distance of nearest school to base
dataset (zillow houses)
# Get the number of schools within a 1 km radius of each house and the
# distance from the house to its nearest school.
library("aspace")  # NOTE(review): only base trig is used below; aspace may be unused

# Radius of Earth in kilometres.
R <- 6371
# Degrees-to-radians conversion factor (pi / 180).
degtoradi <- 0.0174532925

# Great-circle distance in km between one point and a vector of points,
# via the haversine formula.  All coordinates are in degrees.
haversine_km <- function(lat1, lon1, lat2, lon2) {
  p1lat <- lat1 * degtoradi
  p1lon <- lon1 * degtoradi
  p2lat <- lat2 * degtoradi
  p2lon <- lon2 * degtoradi
  dlat <- p2lat - p1lat
  dlon <- p2lon - p1lon
  a <- sin(dlat / 2)^2 + cos(p1lat) * cos(p2lat) * sin(dlon / 2)^2
  R * 2 * atan2(sqrt(a), sqrt(1 - a))
}

# Per-house results.
school <- vector()
distance <- vector()
df <- NULL

# Read data.
Hou <- read.csv("HouseSold.csv")
sch <- read.csv("SchoolData.csv")

# Drop rows with missing coordinates.
Hou <- Hou[complete.cases(Hou$Latitude), ]
Hou <- Hou[complete.cases(Hou$Longitude), ]
sch <- sch[complete.cases(sch$latitude), ]
sch <- sch[complete.cases(sch$longitude), ]

# For each house: distance to every school at once (vectorized), then
# count schools within 1 km and take the nearest of those.
for (i in seq_len(nrow(Hou))) {
  d <- haversine_km(Hou$Latitude[i], Hou$Longitude[i], sch$latitude, sch$longitude)
  within1 <- d[d <= 1]
  # Number of schools within 1 km; nearest distance is Inf when none
  # are in range (matches the original explicit Inf fallback).
  school <- c(school, length(within1))
  distance <- c(distance, if (length(within1) > 0) min(within1) else Inf)
}

# Attach the school count and nearest-school distance to the house data.
df <- data.frame(Hou, school, distance)
View(df)
# Save as CSV
write.csv(df, "HouseSoldSchool.csv")
4. Median Age
# Get the median age per zip code from the US Census 2010 API and join
# it onto the house data.
library(rjson)

# US Census 2010 state code: Washington.
state <- 53
# Requested API key.
APIkey <- "862a8c646c8b573a4bb0ddbe1a03c699da402e50"
# Census variable id for median age in this API.
code <- "P0130001"

# Build the API request URL (all zip code tabulation areas in the state).
webURL <- paste("http://api.census.gov/data/2010/sf1?get=", code,
                "&for=zip+code+tabulation+area:*&in=state:", state,
                "&key=", APIkey, sep = "")
# Parse JSON.
json <- fromJSON(file = webURL, method = "C")
# The first row is the header; keep only the data rows.
json <- json[2:length(json)]
# Zip code is the 3rd column, median age the 1st.
zipcode <- sapply(json, function(r) r[3])
Age <- sapply(json, function(r) r[1])
# Build the zip-code -> median-age lookup table.
MedAgeData <- data.frame(zipcode, as.numeric(Age))
names(MedAgeData) <- c("Zipcode", "MedAge")
View(MedAgeData)

# House data.
House <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/Final.csv")
# Drop houses without a zip code.
HouseAge <- House[complete.cases(House$Postal), ]

# Join: copy the median age onto each house whose zip code matches.
# Kept as a nested loop to preserve last-match-wins behavior; if zip
# codes are known unique, match() would do this in O(n).
for (i in seq_len(nrow(HouseAge))) {
  for (j in seq_len(nrow(MedAgeData))) {
    if (HouseAge$Postal[i] == MedAgeData$Zipcode[j]) {
      HouseAge$MedAge[i] <- MedAgeData$MedAge[j]
    }
  }
}
View(HouseAge)
# Save as CSV
write.csv(HouseAge, "HouseFinal.csv")
5. Median Income
# Get the median household income per zip code from the US Census 2011
# ACS API and join it onto the house data.
library(rjson)
library(plyr)

# Requested API key.
APIkey <- "862a8c646c8b573a4bb0ddbe1a03c699da402e50"
# Census variable id for median household income in this API.
code <- "B19013_001E"

# Build the API request URL (ACS 5-year, all zip code tabulation areas).
webURL <- paste("http://api.census.gov/data/2011/acs5?get=", code,
                "&for=zip+code+tabulation+area:*&key=", APIkey, sep = "")
# Parse JSON.
json <- fromJSON(file = webURL, method = "C")
# The first row is the header; keep only the data rows.
json <- json[2:length(json)]
# Zip code is the 2nd column, income the 1st.
zipcode <- as.character(sapply(json, function(r) r[2]))
Income <- as.character(sapply(json, function(r) r[1]))
# Build the lookup table; non-numeric income entries become NA by coercion.
IncomeData <- data.frame(zipcode, as.numeric(Income))
names(IncomeData) <- c("Zipcode", "MedIncome")
View(IncomeData)
# Save as CSV
write.csv(IncomeData, "IncomeData.csv")

# House data, and the (manually prepared) income table re-read from disk.
House <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/Final.csv")
IncomeData <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/sIncome.csv")
# Drop houses without a zip code.
HouseIncome <- House[complete.cases(House$Postal), ]

# Join on zip code: copy the median income onto each matching house.
# Loop kept to preserve last-match-wins behavior (see median-age note).
for (i in seq_len(nrow(HouseIncome))) {
  for (j in seq_len(nrow(IncomeData))) {
    if (HouseIncome$Postal[i] == IncomeData$Zipcode[j]) {
      HouseIncome$MedIncome[i] <- IncomeData$MedIncome[j]
    }
  }
}
View(HouseIncome)
# Save as CSV
write.csv(HouseIncome, "Final.csv")
6. Neighborhood Data (Environmental features and Transportation
features)
# Neighborhood data was split into beaches, waterfront, monorail, etc.;
# the same code is run once per category.  This instance handles
# beaches: for every house, count the beaches within 0.5 km and record
# the distance to the nearest one.
library("aspace")  # NOTE(review): only base trig is used below; aspace may be unused

# Radius of Earth in kilometres.
R <- 6371
# Degrees-to-radians conversion factor (pi / 180).
degtoradi <- 0.0174532925

# Great-circle distance in km between one point and a vector of points,
# via the haversine formula.  All coordinates are in degrees.
haversine_km <- function(lat1, lon1, lat2, lon2) {
  p1lat <- lat1 * degtoradi
  p1lon <- lon1 * degtoradi
  p2lat <- lat2 * degtoradi
  p2lon <- lon2 * degtoradi
  dlat <- p2lat - p1lat
  dlon <- p2lon - p1lon
  a <- sin(dlat / 2)^2 + cos(p1lat) * cos(p2lat) * sin(dlon / 2)^2
  R * 2 * atan2(sqrt(a), sqrt(1 - a))
}

# Per-house results.
envi <- vector()
distance <- vector()
df <- NULL

# Read data.
nbea <- read.csv("neighborhood_envi2.csv")
Hou <- read.csv("HouseSoldClean.csv")

# Drop houses with a missing coordinate.
# NOTE(review): the original used "|", which kept rows where only one
# coordinate was present and later crashed on if(NA); "&" requires both.
Hou <- Hou[!is.na(Hou$Longitude) & !is.na(Hou$Latitude), ]

# For each house: distance to every beach at once, then count beaches
# within 0.5 km and take the nearest of those.  Beach rows with missing
# coordinates yield NA distances and are excluded.
for (i in seq_len(nrow(Hou))) {
  d <- haversine_km(Hou$Latitude[i], Hou$Longitude[i], nbea$Latitude, nbea$Longitude)
  within <- d[!is.na(d) & d <= 0.5]
  envi <- c(envi, length(within))
  distance <- c(distance, if (length(within) > 0) min(within) else Inf)
}

# Attach the beach count and nearest-beach distance to the house data.
df <- data.frame(Hou, envi, distance)
View(df)
# Save as CSV
write.csv(df, "beach.csv")
7. Crime Data
# Get the number of crimes reported within a 3 km radius of each house.
library("aspace")  # NOTE(review): only base trig is used below; aspace may be unused

# Radius of Earth in kilometres.
R <- 6371
# Degrees-to-radians conversion factor (pi / 180).
degtoradi <- 0.0174532925

# Great-circle distance in km between one point and a vector of points,
# via the haversine formula.  All coordinates are in degrees.
haversine_km <- function(lat1, lon1, lat2, lon2) {
  p1lat <- lat1 * degtoradi
  p1lon <- lon1 * degtoradi
  p2lat <- lat2 * degtoradi
  p2lon <- lon2 * degtoradi
  dlat <- p2lat - p1lat
  dlon <- p2lon - p1lon
  a <- sin(dlat / 2)^2 + cos(p1lat) * cos(p2lat) * sin(dlon / 2)^2
  R * 2 * atan2(sqrt(a), sqrt(1 - a))
}

# Per-house crime counts.
envi <- vector()
df <- NULL

# Read data.
Hou <- read.csv("HouseSold.csv")
nei <- read.csv("crime.csv")

# Both coordinates must be present.
# NOTE(review): the original used "|", which kept rows where only one
# coordinate was present and later crashed on if(NA); "&" requires both.
Hou <- Hou[!is.na(Hou$Longitude) & !is.na(Hou$Latitude), ]
nei <- nei[!is.na(nei$Longitude) & !is.na(nei$Latitude), ]

# For each house, count crimes within 3 km using one vectorized
# distance computation per house.
for (i in seq_len(nrow(Hou))) {
  d <- haversine_km(Hou$Latitude[i], Hou$Longitude[i], nei$Latitude, nei$Longitude)
  envi <- c(envi, sum(d <= 3))
}

# Attach the crime count to the house data.
df <- data.frame(Hou, envi)
View(df)
# Save CSV
write.csv(df, "houseCrime.csv")
Download