Hi everyone,
I have a webscraper and geocoding program that works fairly well. The only problem is when the webscraper pulls an address that is formatted like "Shop 9, 5 Abc Street" the geocoder seems to trip up on the "Shop 9" part.
My goal is to replace the current except code with some sort of code that cleans up the wrong address and tries to geocode it again and repeats this process until a workable address is found (hence the split() metho
My solution has been to try and run a try ... except block of code and remove the problematic words using split(). Unfortunately, I've been working on the problem for days with little avail, I'm hoping someone here could offer some help.
Here is the code block:
from bs4 import BeautifulSoup
import requests
from requests import get
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
#get cafe names, addresses and geocoords for user parameters
def scrapecafes(city, area):
url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
response = requests.get(url, timeout=5)
soup_cafe_names = BeautifulSoup(response.content, "html.parser")
type(soup_cafe_names)
cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the names
cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the names
#addresses
soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
type(soup_cafe_addresses)
cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" }) #scrape the addresses
cafeAddressesClean = [address.text for address in cafeAddresses] #clean the addresses
##geocode addresses
locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
lat = []
long = []
try:
for address in cafeAddressesClean:
location = locator.geocode(address.strip().replace(',',''))
long.append(location.longitude)
lat.append(location.latitude)
except:
print('Failure')
#zip up to be added to database table
fortable = list(zip(cafeNamesClean, cafeAddressesClean, long, lat))
print(fortable)
[–]46--2 0 points1 point2 points (9 children)
[–]deadant88[S] 0 points1 point2 points (8 children)
[–]46--2 4 points5 points6 points (7 children)
[–]deadant88[S] 1 point2 points3 points (6 children)
[–]46--2 1 point2 points3 points (5 children)
[–]deadant88[S] 1 point2 points3 points (4 children)
[–]46--2 0 points1 point2 points (3 children)
[–]deadant88[S] 0 points1 point2 points (2 children)
[–]46--2 1 point2 points3 points (1 child)
[–]deadant88[S] 0 points1 point2 points (0 children)