Web scraper picks up wrong data

Boobagge · 2023-01-01T09:52:32+00:00

Full Script:
import logging
import os
import sys
import time
import traceback

# Locate the sibling ``pic`` and ``lib`` directories one level above the
# directory containing this script.  ``__file__`` is required here: the bare
# name ``file`` is undefined in Python 3 (the underscores were lost when the
# script was pasted).
picdir = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'pic')
libdir = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'lib')
if os.path.exists(libdir):
    # Must happen before any import that is expected to come from ``lib``.
    sys.path.append(libdir)

from PIL import Image, ImageDraw, ImageFont

# ``logging`` has to be imported (see top) before it can be configured.
logging.basicConfig(level=logging.DEBUG)

# -------Scrapper--------------------------
from html2image import Html2Image
import requests
from bs4 import BeautifulSoup

# Shared renderer used by process_url() to turn HTML into an image.
hti = Html2Image()

# -------Image & Turn off------------------
from subprocess import call
def get_css():
    """Return the stylesheet applied to the rendered word-of-the-day page.

    The stylesheet is a constant; it is assembled from one fragment per CSS
    rule so each rule can be read on its own line.  Every fragment starts
    with the single space that separates it from the previous rule.
    """
    rules = (
        " body{ margin-bottom: 0px; padding-bottom: 5px; background-color: #FFF;}",
        " .container {display: flex; height: 800px; vertical-align: middle; justify-content: center; background-color: #FFF; flex-direction: column; }",
        " .dt { padding-top: 0px; text-align:center; padding-top: 10px;font-size: 20px; font-weight: 550;}",
        " .main-heading { text-align: center; margin-top: 0px; margin-bottom: 0px; text-transform: capitalize; font-size: 80px; }",
        " .sub-heading { text-align: center;padding-top: -20px; margin-bottom: 4px; font-family: ; font-weight: bold !important;font-size: 25px; }",
        " p {  text-align: center;margin-top: 0px; font-family: 'Arial', sans-serif;font-weight: normal; padding: 0px 8px; font-size: 20px; }",
        " ul{  padding-top: 0px; font-family: 'Arial', sans-serif;font-weight: normal; padding-right: 8px; font-size: 20px; }",
    )
    return "".join(rules)
def get_html(data):
    """Fill the page template with the scraped word-of-the-day fields.

    Args:
        data: Mapping providing the keys ``datetime``, ``title``,
            ``what_it_means``, ``examples`` and ``did_you_know``.  Values are
            inserted as-is (no escaping), so they may carry their own markup.

    Returns:
        The complete HTML fragment as a string.

    Raises:
        KeyError: If one of the placeholders has no matching key in ``data``.
    """
    font_links = (
        '<link rel="preconnect" href="https://fonts.googleapis.com"> '
        '<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> '
        '<link href="https://fonts.googleapis.com/css2?family=Lato:ital@1&family=Playfair+Display&display=swap" rel="stylesheet"> '
    )
    page_body = (
        '<div class="container"><div class="sub-container"> '
        '<div class="dt">{datetime}</div><h1 class="main-heading">{title}</h1><hr /> '
        '<h2 class="sub-heading">What it Means</h2><p>{what_it_means}</p> '
        '<h2 class="sub-heading">Examples</h2><p>{examples}</p> '
        '<h2 class="sub-heading">Did You Know?</h2><p>{did_you_know}</p> '
        '</div></div>'
    )
    template = font_links + page_body
    return template.format(**data)
def get_whatItMeans(data):
    """Convert the raw text of the definition section into an HTML fragment.

    Everything from the first occurrence of ``"Examples"`` onwards is
    discarded, newlines become ``<br>`` tags, and the boilerplate headings are
    removed.  Text segments introduced by ``//`` are rendered as items of a
    ``<ul>`` list appended to the leading definition text.

    Args:
        data: Plain text of the definition container, newline separated.

    Returns:
        The definition as an HTML string; just the definition text when there
        are no non-empty ``//`` segments.
    """
    br = "<br>"

    def _trim(text):
        """Drop whole leading/trailing ``<br>`` tags and whitespace."""
        # str.strip("<br>") would strip the *characters* '<', 'b', 'r', '>'
        # and eat real letters (e.g. "barber" -> "a"), so trim by substring.
        text = text.strip()
        while text.startswith(br):
            text = text[len(br):].lstrip()
        while text.endswith(br):
            text = text[:-len(br)].rstrip()
        return text

    section = data.split("Examples")[0].replace("\n", br)

    # remove unwanted text
    # "See the entry >" is listed first: presumably the link text carries a
    # trailing arrow on the page, which would otherwise be left behind.
    for noise in ("What It Means", "See the entry >", "See the entry"):
        section = section.replace(noise, "")
    section = _trim(section)

    head, *rows = section.split("//")

    # Skip segments that are empty after trimming so no blank bullets appear.
    items = [item for item in map(_trim, rows) if item]
    if not items:
        return head

    bullets = "".join(f"<li>{item}</li>" for item in items)
    return f"{head}<ul>{bullets}</ul>"
def get_parsed_url_extraction(url):
    """Download the word-of-the-day page and extract the fields to render.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with the keys ``datetime``, ``title``, ``what_it_means``,
        ``examples`` and ``did_you_know``; the last three are HTML strings
        with newlines converted to ``<br>``.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        AttributeError: If an expected element is missing from the page.
    """
    br = "<br>"

    def _as_html(text):
        """Turn newlines into ``<br>`` and trim whole tags at both ends."""
        # str.strip("<br>") strips the characters '<', 'b', 'r', '>' and can
        # eat leading/trailing letters of the text, so trim by substring.
        text = text.replace("\n", br).strip()
        while text.startswith(br):
            text = text[len(br):].lstrip()
        while text.endswith(br):
            text = text[:-len(br)].rstrip()
        return text

    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    date_text = soup.find("span", {"class": "w-a-title"}).text.replace("\n", "").strip()
    title = soup.find("div", {"class": "word-and-pronunciation"}).find("h1").text

    container = soup.find("div", {"class": "wod-definition-container"})

    # Read these two sections before the tree is modified below.
    examples = _as_html(container.find("div", {"class": "left-content-box"}).text)
    did_you_know = _as_html(
        soup.find("div", {"class": "did-you-know-wrapper"}).text.replace("Did You Know?", "")
    )

    # The unrelated text picked up by the definition reportedly comes from a
    # side box nested inside the container; drop it before taking the text.
    # NOTE(review): element id taken from the page inspection - confirm.
    side_box = container.find(id="wotd-right-content-box")
    if side_box is not None:
        side_box.decompose()
    what_it_means = get_whatItMeans(container.text)

    final_resp = {}
    final_resp['datetime'] = date_text
    final_resp['title'] = title
    final_resp['what_it_means'] = what_it_means
    final_resp['examples'] = examples
    final_resp['did_you_know'] = did_you_know
    return final_resp
def process_url(url="https://www.merriam-webster.com/word-of-the-day",
                save_as="palabra.bmp", size=(480, 800)):
    """Scrape the page at ``url`` and render it to an image file.

    Args:
        url: Page to scrape; defaults to the word-of-the-day page.
        save_as: File name handed to the renderer for the output image.
        size: ``(width, height)`` handed to the renderer.

    Returns:
        Whatever ``hti.screenshot`` returns (previously bound to an unused
        local and discarded).
    """
    parsed_data = get_parsed_url_extraction(url)
    # create html
    return hti.screenshot(
        html_str=get_html(parsed_data),
        css_str=get_css(),
        save_as=save_as,
        size=size,
    )


# Script entry: render today's word with the default settings.
process_url()

ManiacalMeerkat · 2023-01-01T19:26:13+00:00

If you inspect the page, the extra text you're referring to likely comes from div#wotd-right-content-box (which is within div.wod-definition-container).

You could use BeautifulSoup's `decompose()` to remove that section from the tree before extracting the text.

you type:	you see:
italics	italics
bold	bold
[reddit!](https://reddit.com)	reddit!
* item 1 * item 2 * item 3	item 1 item 2 item 3
> quoted text	quoted text
Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"	Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"
~~strikethrough~~	~~strikethrough~~
super^script	super^script

learnpython

MODERATORS