## Data Collection Code
## 1. Zillow Scraper --------------------------------------------------------
## Part 1: zoom in on one Seattle map area and scrape its recently-sold
## listings (this was repeated for 12 different areas). Zillow shows 25
## houses per page; the URL keeps returning results up to i = 40 pages, so
## we scrape 40 pages to get as much data as possible and de-duplicate after.
# install.packages("XML")
# install.packages("RCurl")
library(XML)
library(RCurl)

# One accumulator vector per attribute, filled page by page inside the loop.
streetdata          <- vector()
pricedata           <- vector()
zestdata            <- vector()
addressLocalitydata <- vector()
postalCodedata      <- vector()
addressRegiondata   <- vector()
propertydetailsdata <- vector()
latitudedata        <- vector()
longitudedata       <- vector()
homeTypedata        <- vector()
lotdata             <- vector()
yeardata            <- vector()
price_sqftdata      <- vector()
propertyaddressdata <- vector()

# Extract one attribute from each listing node of a page.
#   nodes : list of <article> nodes, one per listing.
#   xpath : relative XPath for the attribute inside a listing.
#   ...   : forwarded to xpathSApply (pass xmlValue for element text; pass
#           nothing for attribute XPaths such as meta/@content).
# Listings hidden behind a sign-in wall come back as empty lists -> NA.
# Zillow randomly duplicates listings, so only the first 25 (one page's
# worth) are kept; the rest are duplicates.
scrape_field <- function(nodes, xpath, ...) {
  vals <- lapply(nodes, xpathSApply, xpath, ...)
  vals[sapply(vals, is.list)] <- NA
  unlist(vals)[1:25]
}

# Number of pages to scrape.
x <- 40

# Map-rectangle URL for this area. NOTE: the original listing had a stray
# space after "Seattle-WA/" and the paged URL was missing the minus sign on
# the western longitude (122.396536); both are fixed so every page queries
# the same rectangle as page 1.
baseurl <- paste0(
  "http://www.zillow.com/homes/recently_sold/Seattle-WA/16037_rid/",
  "47.780924,-122.268648,47.716634,-122.396536_rect/12_zm/"
)

for (i in seq_len(x)) {
  # Page 1 has no "_p/" page suffix; all later pages do.
  url <- if (i == 1) baseurl else paste0(baseurl, i, "_p/")

  # Parse the HTML and split the page into one <article> node per listing;
  # scraping per listing keeps the different attributes aligned row-wise.
  html <- htmlTreeParse(url, useInternalNodes = TRUE)
  main <- getNodeSet(html, '//article[@role="article"]')

  streetdata          <- c(streetdata,
                           scrape_field(main, './/span[@itemprop="streetAddress"]', xmlValue))
  propertyaddressdata <- c(propertyaddressdata,
                           scrape_field(main, './/dt[@class="propertyaddress"]', xmlValue))
  pricedata           <- c(pricedata,
                           scrape_field(main, './/dl[@class="property-info-list col-1 column"]/dt[@class="type-recentlySold type show-icon"]', xmlValue))
  # Zillow's estimated price (not used as a model attribute; collected to
  # compare Zillow's estimate with the actual sale price).
  zestdata            <- c(zestdata,
                           scrape_field(main, './/div[@class="zestimate"]', xmlValue))
  # Sold date (stored in the homeType columns, as in the original script).
  homeTypedata        <- c(homeTypedata,
                           scrape_field(main, './/dt[@class="sold-date"]', xmlValue))
  addressLocalitydata <- c(addressLocalitydata,
                           scrape_field(main, './/span[@itemprop="addressLocality"]', xmlValue))
  addressRegiondata   <- c(addressRegiondata,
                           scrape_field(main, './/span[@itemprop="addressRegion"]', xmlValue))
  postalCodedata      <- c(postalCodedata,
                           scrape_field(main, './/span[@itemprop="postalCode"]', xmlValue))
  # Latitude/longitude live in meta tag attributes, so no xmlValue here.
  latitudedata        <- c(latitudedata,
                           scrape_field(main, './/meta[@itemprop="latitude"]/@content'))
  longitudedata       <- c(longitudedata,
                           scrape_field(main, './/meta[@itemprop="longitude"]/@content'))
  # Bed/bath/sqft come as one free-text blob with no finer tag; cleaned later.
  propertydetailsdata <- c(propertydetailsdata,
                           scrape_field(main, './/dt[@class="propertydata"]', xmlValue))
  lotdata             <- c(lotdata,
                           scrape_field(main, './/dt[@class="property-lot"]', xmlValue))
  yeardata            <- c(yeardata,
                           scrape_field(main, './/dt[@class="property-year"]', xmlValue))
  price_sqftdata      <- c(price_sqftdata,
                           scrape_field(main, './/dd[@class="price-sqftb"]', xmlValue))
}

# Assemble one row per scraped listing. (The original column name
# "StreeAddress" is corrected to "StreetAddress"; the same corrected script
# generates all 12 area frames, so the rbind below still lines up.)
Area1Sold <- data.frame(
  "Property_Address" = propertyaddressdata,
  "StreetAddress"    = streetdata,
  "Price"            = pricedata,
  "Zillow_Estimate"  = zestdata,
  "House_Type"       = homeTypedata,
  "Locality"         = addressLocalitydata,
  "Region"           = addressRegiondata,
  "Postal"           = postalCodedata,
  "Latitude"         = latitudedata,
  "Longitude"        = longitudedata,
  "PropertyDetails"  = propertydetailsdata,
  "Lot_Area"         = lotdata,
  "Built_Year"       = yeardata,
  "Price_Sqft"       = price_sqftdata
)

# Bookkeeping column so we can tell which of the 12 areas each row came from.
Area1Sold$Record <- rep("Area1 Sold", nrow(Area1Sold))

# Check the data, drop duplicates, and save.
View(Area1Sold)
nrow(Area1Sold)
Area1Sold <- Area1Sold[!duplicated(Area1Sold), ]
write.csv(Area1Sold, "Area1Sold.csv")
# rm(list=ls())

## Part 2: the 12 scraped areas overlap, so merge them and remove duplicates.
combine <- rbind(Area1Sold, Area2Sold, Area3Sold, Area4Sold, Area5Sold,
                 Area6Sold, Area7Sold, Area8Sold, Area9Sold, Area10Sold,
                 Area11Sold, Area12Sold)
View(combine)
## Part 2 (continued): drop the area-tracking column, de-duplicate, save ----
# Record only tracked which of the 12 areas a row was scraped from.
combine$Record <- NULL
combine$row.names <- NULL  # no-op unless a literal "row.names" column exists; kept from original
View(combine)
HouseSoldData <- combine[!duplicated(combine), ]
View(HouseSoldData)
write.csv(HouseSoldData, "HouseSoldData.csv")
# rm(list=ls())

## 2. Get School Data -------------------------------------------------------
# Get all schools within a 5 mile radius of each house. Seattle is small, so
# 5 miles is generous, but it guarantees no school is missed.
# install.packages("rjson")
# install.packages("plyr")   # needed for rbind.fill
library(plyr)
library(rjson)

# Requested API key; differs with IP address.
eduAPIkey <- "e94e2c906c025ba5fc26fd12526ae602"
# eduAPIkey <- "8455c78313d430ee187149eefa5a43f7"

# Search radius in miles.
radius <- 5

SchoolFinal <- vector()
SchoolData  <- vector()

Sold <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/(HouseSoldData)FULL_R.csv")
n <- nrow(Sold)

# Accumulates one parsed JSON response per house.
f <- vector()

# Query the education.com API once per house location.
for (i in seq_len(n)) {
  resURL <- paste0(
    "http://api.education.com/service/service.php?f=schoolSearch",
    "&key=", eduAPIkey, "&sn=sf&v=4",
    "&latitude=", Sold$Latitude[i],
    "&longitude=", Sold$Longitude[i],
    "&distance=", radius, "&resf=json"
  )
  json <- fromJSON(file = resURL, method = "C")
  f <- c(f, json)
}

# Attribute names as they appear in the JSON records. NOTE(review):
# "districleaid" is kept verbatim from the original field list -- confirm
# against the API docs (a misspelled field simply yields NA).
fieldList <- c("nces_id", "schoolname", "districleaid", "zip", "city",
               "state", "latitude", "longitude", "schooltype",
               "testrating_text", "gradelevel", "studentteacherratio")

# Use lapply (not sapply) so the result is always a list of per-school
# vectors: sapply would collapse to a matrix whenever every school has the
# same fields, which breaks the per-element rbind.fill step below.
temp <- lapply(f, function(rec) unlist(rec$school[fieldList]))
SchoolData <- rbind.fill(lapply(temp, function(y)
  as.data.frame(t(y), stringsAsFactors = FALSE)))
SchoolData$latitude  <- as.numeric(SchoolData$latitude)
SchoolData$longitude <- as.numeric(SchoolData$longitude)

# Extract the numeric test rating. The original pattern "[^09]" stripped
# every digit except 0 and 9; "[^0-9]" keeps all digits as intended.
SchoolData$testrating <- as.numeric(gsub("[^0-9]", "", SchoolData$testrating_text))
View(SchoolData)

SchoolData <- SchoolData[!duplicated(SchoolData), ]
View(SchoolData)
write.csv(SchoolData, "SchoolData.csv")
# rm(list=ls())

## 3. Schools near each house -----------------------------------------------
# Number of schools within 1 km of each house and distance to the nearest.
library("aspace")  # kept from original; the distance math below uses base trig

R <- 6371                   # Earth radius, km
degtoradi <- 0.0174532925   # degrees -> radians conversion factor

# Haversine great-circle distance in km between points given in degrees.
# All arguments are vectorized, so one call covers house i vs. every school.
haversine_km <- function(lat1d, long1d, lat2d, long2d) {
  lat1 <- lat1d * degtoradi
  long1 <- long1d * degtoradi
  lat2 <- lat2d * degtoradi
  long2 <- long2d * degtoradi
  la <- lat2 - lat1
  lo <- long2 - long1
  a <- sin(la / 2)^2 + cos(lat1) * cos(lat2) * sin(lo / 2)^2
  R * 2 * atan2(sqrt(a), sqrt(1 - a))
}

school   <- vector()
distance <- vector()
df <- NULL

Hou <- read.csv("HouseSold.csv")
sch <- read.csv("SchoolData.csv")

# Drop rows with missing coordinates on either side.
Hou <- Hou[complete.cases(Hou$Latitude), ]
Hou <- Hou[complete.cases(Hou$Longitude), ]
sch <- sch[complete.cases(sch$latitude), ]
sch <- sch[complete.cases(sch$longitude), ]

for (i in seq_len(nrow(Hou))) {
  # Distances from house i to every school at once (vectorized haversine),
  # keeping only schools within 1 km.
  dists <- haversine_km(Hou$Latitude[i], Hou$Longitude[i],
                        sch$latitude, sch$longitude)
  near <- dists[dists <= 1]
  school   <- c(school, length(near))
  # Inf marks "no school within 1 km", as in the original.
  distance <- c(distance, if (length(near) > 0) min(near) else Inf)
}

# Attach school count and nearest-school distance to the house data.
df <- data.frame(Hou, school, distance)
View(df)
write.csv(df, "HouseSoldSchool.csv")
## 4. Median Age ------------------------------------------------------------
# Get the median age per zipcode from the US Census 2010 API.
library(rjson)

# US Census 2010 state code: Washington.
state <- 53
# Requested API key.
APIkey <- "862a8c646c8b573a4bb0ddbe1a03c699da402e50"
# Census variable id for median age.
code <- "P0130001"

# Build the API URL.
webURL <- paste0("http://api.census.gov/data/2010/sf1?get=", code,
                 "&for=zip+code+tabulation+area:*&in=state:", state,
                 "&key=", APIkey)

# Parse JSON; the first row is the header, data starts at row 2.
json <- fromJSON(file = webURL, method = "C")
json <- json[2:length(json)]

# Zipcode is the 3rd column, median age the 1st.
zipcode <- sapply(json, function(r) r[3])
Age     <- sapply(json, function(r) r[1])

MedAgeData <- data.frame(zipcode, as.numeric(Age))
names(MedAgeData) <- c("Zipcode", "MedAge")
View(MedAgeData)

House <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/Final.csv")
# Drop houses with no zipcode.
HouseAge <- House[complete.cases(House$Postal), ]

# Join median age onto the house data by zipcode. match() replaces the
# original O(n*m) double loop; comparing as character avoids factor/integer
# coercion surprises, and unmatched zipcodes get NA exactly as the loop
# left them unset.
idx <- match(as.character(HouseAge$Postal), as.character(MedAgeData$Zipcode))
HouseAge$MedAge <- MedAgeData$MedAge[idx]

View(HouseAge)
write.csv(HouseAge, "HouseFinal.csv")
## 5. Median Income ---------------------------------------------------------
# Get the median household income per zipcode from the US Census 2011 ACS API.
library(rjson)
library(plyr)

# Requested API key.
APIkey <- "862a8c646c8b573a4bb0ddbe1a03c699da402e50"
# ACS variable id for median household income.
code <- "B19013_001E"

# Build the API URL (the original had a stray space inside
# "zip+code+tabulation", fixed here).
webURL <- paste0("http://api.census.gov/data/2011/acs5?get=", code,
                 "&for=zip+code+tabulation+area:*&key=", APIkey)

# Parse JSON; the first row is the header, data starts at row 2.
json <- fromJSON(file = webURL, method = "C")
json <- json[2:length(json)]

# Zipcode is the 2nd column, income the 1st.
zipcode <- as.character(sapply(json, function(r) r[2]))
Income  <- as.character(sapply(json, function(r) r[1]))

# NAs introduced by coercion are expected (suppressed/missing income values).
IncomeData <- data.frame(zipcode, as.numeric(Income))
names(IncomeData) <- c("Zipcode", "MedIncome")
View(IncomeData)
write.csv(IncomeData, "IncomeData.csv")

House <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/Final.csv")
# NOTE(review): this reads a pre-existing "sIncome.csv" rather than the
# "IncomeData.csv" written just above -- kept verbatim from the original;
# confirm this is intentional.
IncomeData <- read.csv("C:/Users/Pranali Shetty/Desktop/OutputScripts/sIncome.csv")

# Drop houses with no zipcode.
HouseIncome <- House[complete.cases(House$Postal), ]

# Join median income onto the house data by zipcode (match() replaces the
# original O(n*m) double loop; unmatched zipcodes get NA).
idx <- match(as.character(HouseIncome$Postal), as.character(IncomeData$Zipcode))
HouseIncome$MedIncome <- IncomeData$MedIncome[idx]

View(HouseIncome)
write.csv(HouseIncome, "Final.csv")

## 6. Neighborhood Data (environmental and transportation features) ---------
# As stated in the writeup, the neighborhood data was divided into beach,
# waterfront, monorail, etc.; the CSV was split into those sections and the
# same code run for each. The code below handles only the beaches.
# Number of beaches within 0.5 km of each house and distance to the nearest.
library("aspace")  # kept from original; the distance math below uses base trig

R <- 6371                   # Earth radius, km
degtoradi <- 0.0174532925   # degrees -> radians conversion factor

# Haversine great-circle distance in km between points given in degrees.
# Vectorized, so one call covers house i vs. every beach.
haversine_km <- function(lat1d, long1d, lat2d, long2d) {
  lat1 <- lat1d * degtoradi
  long1 <- long1d * degtoradi
  lat2 <- lat2d * degtoradi
  long2 <- long2d * degtoradi
  la <- lat2 - lat1
  lo <- long2 - long1
  a <- sin(la / 2)^2 + cos(lat1) * cos(lat2) * sin(lo / 2)^2
  R * 2 * atan2(sqrt(a), sqrt(1 - a))
}

envi     <- vector()
distance <- vector()
df <- NULL

nbea <- read.csv("neighborhood_envi2.csv")
Hou  <- read.csv("HouseSoldClean.csv")

# Keep only houses with BOTH coordinates present. The original filter used
# "|", which kept rows missing one coordinate and then failed/NA'd in the
# distance test.
Hou <- Hou[!is.na(Hou$Longitude) & !is.na(Hou$Latitude), ]

for (i in seq_len(nrow(Hou))) {
  # Distances from house i to every beach; beaches with missing coordinates
  # yield NA distances and are excluded (the original loop crashed on them).
  dists <- haversine_km(Hou$Latitude[i], Hou$Longitude[i],
                        nbea$Latitude, nbea$Longitude)
  near <- dists[!is.na(dists) & dists <= 0.5]
  envi     <- c(envi, length(near))
  # Inf marks "no beach within 0.5 km", as in the original.
  distance <- c(distance, if (length(near) > 0) min(near) else Inf)
}

# Attach beach count and nearest-beach distance to the house data.
df <- data.frame(Hou, envi, distance)
View(df)
write.csv(df, "beach.csv")
## 7. Crime Data ------------------------------------------------------------
# Number of crimes within a 3 km radius of each house.
library("aspace")  # kept from original; the distance math below uses base trig

R <- 6371                   # Earth radius, km
degtoradi <- 0.0174532925   # degrees -> radians conversion factor

# Haversine great-circle distance in km between points given in degrees.
# Vectorized, so one call covers house i vs. every crime record.
haversine_km <- function(lat1d, long1d, lat2d, long2d) {
  lat1 <- lat1d * degtoradi
  long1 <- long1d * degtoradi
  lat2 <- lat2d * degtoradi
  long2 <- long2d * degtoradi
  la <- lat2 - lat1
  lo <- long2 - long1
  a <- sin(la / 2)^2 + cos(lat1) * cos(lat2) * sin(lo / 2)^2
  R * 2 * atan2(sqrt(a), sqrt(1 - a))
}

envi <- vector()
df <- NULL

Hou <- read.csv("HouseSold.csv")
nei <- read.csv("crime.csv")

# Keep only rows with BOTH coordinates present. The original filter used
# "|", which kept rows missing one coordinate and produced NA distances.
Hou <- Hou[!is.na(Hou$Longitude) & !is.na(Hou$Latitude), ]
nei <- nei[!is.na(nei$Longitude) & !is.na(nei$Latitude), ]

for (i in seq_len(nrow(Hou))) {
  # Count crimes within 3 km of house i (vectorized haversine).
  dists <- haversine_km(Hou$Latitude[i], Hou$Longitude[i],
                        nei$Latitude, nei$Longitude)
  envi <- c(envi, sum(dists <= 3, na.rm = TRUE))
}

# Attach the crime count to the house data.
df <- data.frame(Hou, envi)
View(df)
write.csv(df, "houseCrime.csv")