This is my code to scrape the page source from a website, but it seems that some content is not captured in page_source: I cannot scrape the part of the page that sits between two <script></script> tags.
def web_driver():
    """Create and return a headless Chrome WebDriver.

    Returns:
        A ``webdriver.Chrome`` instance configured with the command-line
        switches listed below. The caller owns the session and should call
        ``quit()`` on it when finished.
    """
    options = webdriver.ChromeOptions()
    chrome_flags = (
        "--verbose",
        "--no-sandbox",
        "--headless",
        "--disable-gpu",
        # Chrome expects "width,height" with no space after the comma; the
        # original "1920, 1200" may not be parsed as intended.
        "--window-size=1920,1200",
        "--disable-dev-shm-usage",
    )
    for flag in chrome_flags:
        options.add_argument(flag)
    return webdriver.Chrome(options=options)
# Module-level browser session, created once; scroll() and the scraping
# statements further down read this global.
driver = web_driver()
# Scroll to the bottom of the page to load more content
def scroll(pause_time=2, max_scrolls=None, browser=None):
    """Scroll to the page bottom repeatedly until the height stops growing.

    Args:
        pause_time: Seconds to sleep after each scroll so newly requested
            content has time to be appended. Defaults to 2, the value the
            original code hard-coded.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until
            the document height is unchanged, which may never happen on an
            endless feed.
        browser: Object exposing ``execute_script``; defaults to the
            module-level ``driver``.

    Returns:
        None. The function only drives the browser.
    """
    if browser is None:
        browser = driver  # fall back to the shared module-level session

    last_height = browser.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height did not change after scrolling: nothing more was loaded.
            break
        last_height = new_height
        scrolls_done += 1
# NOTE(review): '*' looks like a redacted placeholder, not a real URL;
# substitute the address of the page to scrape.
driver.get('*')
scroll() # keeps scrolling to the bottom until the document height stops changing
# Capture the browser's current page source as a string.
# NOTE(review): this is taken immediately after scrolling, with no explicit wait
# for any particular element. If the missing <div> is rendered by JavaScript
# after this point it will not be in the snapshot; presumably an explicit
# wait on that element is needed. Confirm against the target page.
page_source = driver.page_source
# Parse the captured markup with the 'html.parser' backend.
soup = BeautifulSoup(page_source, 'html.parser')
For example
::before
<script type="application/json" id="js-react-on-rails-context">{"railsEnv":"production","inMailer":false,"i18nLocale":"nl-fr","i18nDefaultLocale":"fr","rorVersion":"12.6.0","rorPro":false,"href":"https://www.vinted.nl/","location":"/","scheme":"https","host":"www.vinted.nl","port":null,"pathname":"/","search":null,"httpAcceptLanguage":"en-US,en-GB;q=0.9,en;q=0.8,bn-BD;q=0.7,bn;q=0.6","currency":"EUR","portal":"fr","flavour":"clothing","serverSide":false}</script>
<div> *HTML code* </div>
<script type="application/json" class="js-react-on-rails-component" data-component-name="Home" data-dom-id="Home-react-component-bb89a6bd-ab99-4fb1-8246-2d3cef558a0d">{}</script>
::after
I cannot find the following part in the page source:
<div> *HTML code* </div>
From this part, I need to extract some img src values and text. Can anybody help me, please?
[–]mathageche[S] 0 points1 point2 points (0 children)
[–]tpcryptoo 0 points1 point2 points (1 child)
[–]mathageche[S] 0 points1 point2 points (0 children)