diff --git a/Visual Studio Code Projects/zillow-rental-data.py b/Visual Studio Code Projects/zillow-rental-data.py
new file mode 100644
index 0000000..6874ba4
--- /dev/null
+++ b/Visual Studio Code Projects/zillow-rental-data.py
@@ -0,0 +1,68 @@
+from bs4 import BeautifulSoup
+import requests
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+import time
+
+# Part 1 - Scrape the links, addresses, and prices of the rental properties
+
+header = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
+    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
+}
+
+# Use our Zillow-Clone website (instead of Zillow.com)
+response = requests.get("https://appbrewery.github.io/Zillow-Clone/", headers=header)
+
+data = response.text
+soup = BeautifulSoup(data, "html.parser")
+
+# Create a list of all the links on the page using a CSS selector
+all_link_elements = soup.select(".StyledPropertyCardDataWrapper a")
+# Python list comprehension (covered in Day 26)
+all_links = [link["href"] for link in all_link_elements]
+print(f"There are {len(all_links)} links to individual listings in total:\n")
+print(all_links)
+
+# Create a list of all the addresses on the page using a CSS selector
+# Replace the " | " separators and strip surrounding whitespace/newlines to clean up the address data
+all_address_elements = soup.select(".StyledPropertyCardDataWrapper address")
+all_addresses = [address.get_text().replace(" | ", " ").strip() for address in all_address_elements]
+print(f"\nAfter cleaning, the {len(all_addresses)} addresses look like this:\n")
+print(all_addresses)
+
+# Create a list of all the prices on the page using a CSS selector
+# Keep only the dollar amount: strip the "/mo" suffix and anything after a "+" symbol
+all_price_elements = soup.select(".PropertyCardWrapper span")
+all_prices = [price.get_text().replace("/mo", "").split("+")[0] for price in all_price_elements if "$" in price.text]
+print(f"\nAfter cleaning, the {len(all_prices)} prices look like this:\n")
+print(all_prices)
+
+
+# Part 2 - Fill in the Google Form using Selenium
+
+# Optional - Keep the browser open (helps diagnose issues if the script crashes)
+chrome_options = webdriver.ChromeOptions()
+chrome_options.add_experimental_option("detach", True)
+driver = webdriver.Chrome(options=chrome_options)
+
+for n in range(len(all_links)):
+    # TODO: Fill in the link to your own Google Form
+    driver.get("YOUR_GOOGLE_FORM_LINK_HERE")
+    time.sleep(2)
+
+    # Use XPaths to select the "short answer" fields in your Google Form.
+    # Note: your XPaths may differ if you created a different form.
+    address = driver.find_element(by=By.XPATH,
+                                  value='//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div/div[1]/input')
+    price = driver.find_element(by=By.XPATH,
+                                value='//*[@id="mG61Hd"]/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/input')
+    link = driver.find_element(by=By.XPATH,
+                               value='//*[@id="mG61Hd"]/div[2]/div/div[2]/div[3]/div/div/div[2]/div/div[1]/div/div[1]/input')
+    submit_button = driver.find_element(by=By.XPATH,
+                                        value='//*[@id="mG61Hd"]/div[2]/div/div[3]/div[1]/div[1]/div')
+
+    address.send_keys(all_addresses[n])
+    price.send_keys(all_prices[n])
+    link.send_keys(all_links[n])
+    submit_button.click()
\ No newline at end of file
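
The script above relies on a fixed time.sleep(2) and on absolute XPaths copied from one specific Google Form, both of which break easily when the page renders slowly or the form layout changes. Below is a minimal alternative sketch for Part 2, reusing driver, all_addresses, all_prices, and all_links from the script. It waits explicitly for the form to render and locates the short-answer fields by input type rather than by absolute XPaths. Note that the input[type="text"] selector and the Submit-button XPath are assumptions about how Google Forms renders its widgets, not details confirmed by this commit; verify them against your own form.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
for address_text, price_text, link_text in zip(all_addresses, all_prices, all_links):
    driver.get("YOUR_GOOGLE_FORM_LINK_HERE")  # same placeholder as in the script above
    # Block until at least one short-answer input has rendered (replaces time.sleep).
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="text"]')))
    # Assumption: the form's three short-answer questions render as text inputs, in order.
    answer_fields = driver.find_elements(By.CSS_SELECTOR, 'input[type="text"]')
    answer_fields[0].send_keys(address_text)
    answer_fields[1].send_keys(price_text)
    answer_fields[2].send_keys(link_text)
    # Assumption: the submit button contains a span whose visible text is "Submit".
    driver.find_element(By.XPATH, '//span[text()="Submit"]').click()

An explicit wait keyed to the element you are about to use is generally more reliable than a fixed sleep: it proceeds as soon as the field appears and raises a clear TimeoutException if it never does.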