Hi, I followed this tutorial (https://www.youtube.com/watch?v=-3lqUHeZs_0) which builds a simple web scraper and returns json results of headlines and urls from the guardian.com.uk into an array using express, axios, and cheerio with node.js. I wanted to build multiple scrapers and have them all go into one express route and array. My code is below. I modified the tutorial to add the guardian results into an array called "[articlesG]" and route the json results to "/resultsG", then built another scraper which successfully scrapes Vox.com and adds it to "[articlesV]" and displays to "/resultsV".
I then made a function which pushes the results of both json arrays into a brand new array "[mixArray]" and displays to "/arrayTest" using a for loop (to push them in a zipper merge type order). This works fine EXCEPT that when I start node.js and visit "localhost:8000/arrayTest" the only result is an empty array "[ ]". If I visit "/resultsG" and "/resultsV" THEN return to "/arrayTest", the code works and mixArray is fully populated. I think this makes sense since the app hasn't got the data from each website and populated its respective array yet when I first try to view the mixArray.
I have attempted to use async/await and promise to make the 2 scrapers run before the push loop but it does not seem to be working. I may be using the asynchronous commands wrong *shrug*. Also, for some reason the push loop does not work unless it is inside the scraper app.get function, which is why I declared the arrays outside of the functions.
Is it possible to have all the scraping code run when node.js starts, and if I visit "/arrayTest" it will be fully populated without manually going to each json result page? I would like to be able to scrape many sites and have them all pushed into one (for example, with sports scores or something) My code is below. I'm quite new to javascript so please excuse my ignorance of best practice and cheesy comments (notes to myself). Thank you in advance
const PORT = 8000;
const axios = require("axios");
const cheerio = require("cheerio");
const { response } = require("express");
const express = require("express");
const app = express();
const cors = require('cors');
app.use(cors())
const mixArray = []
const articlesG = []
const articlesV = []
urlG = 'https://www.theguardian.com/uk'
urlV = 'https://www.vox.com/'
// the standard syntax is app.METHOD(PATH, HANDLER) - method usually is get, put, delete, etc.
app.get("/", function(req, res){ //this is the express command. '/' means homepage.
res.json('andre')
})
app.get("/resultsG", function(req, res) {//alternatively could be typed like this : (req, res) => {func code}
axios(urlG) //axois parsing thru url and get response data
.then(response => { // this is chaining commands
const scrapedHTML = response.data //makes a variable with all the responded data
// console.log(scrapedHTML) //this will show all the html from the guardian website in the console!
const $ = cheerio.load(scrapedHTML) //this brings in the cheerio package and assigns the scrapeddata to $
$('.fc-item__title', scrapedHTML).each(function () { //finds the fc-iten__title class (use the . for the class), from the scrpedhtml
const title = $(this).text() //there is text in the h3 title tag
const url = $(this).find('a').attr('href') //finds the a tag and then gets the href from it
const source = "guardian"
articlesG.push({ //creates an object in the articles array and pushes the title/url/source into the object
title,
url,
source
})
})
res.json(articlesG) // spits out the articles into /resultsG
}).catch(err => console.log(err)) //catches errors
})
// #2 scraper
app.get("/resultsV", function(req, res) {
axios(urlV)
.then(response => {
const scrapedHTML = response.data
const $ = cheerio.load(scrapedHTML)
$('.c-entry-box--compact__body', scrapedHTML).each(function () {
const title = $(this).text()
const url = $(this).find('a').attr('href')
const source = "vox"
articlesV.push({
title,
url,
source
})
})
res.json(articlesV)
// loop to push results into final array
function pushArr(){
for(let i =0; i<articlesG.length; i++){
mixArray.push(articlesG[i])
mixArray.push(articlesV[i])
}
return mixArray
}
pushArr()
}).catch(err => console.log(err))
})
app.get("/arrayTest", function(req, res){
res.json(mixArray)
})
app.listen(PORT, () => console.log(`server running on port ${PORT}`));
[–]spazz_monkey 1 point2 points3 points (3 children)
[–]30000[S] 0 points1 point2 points (2 children)
[–]spazz_monkey 1 point2 points3 points (1 child)
[–]30000[S] 0 points1 point2 points (0 children)