Puppeteer - CSV-Datei sequentiell / parallel abarbeiten (Beispiel)

Aus Wikizone
Wechseln zu: Navigation, Suche

Wenn man in Puppeteer Bulk-Aufgaben hat, kann man leicht durcheinanderkommen, denn an vielen Stellen mischen sich asynchron mit synchron abgearbeiteten Funktionen.

Links[Bearbeiten]

https://blog.logrocket.com/complete-guide-csv-files-node-js/

Beispiele[Bearbeiten]

Datei lesen

// Read urls.csv as UTF-8 text and print either the read error or the file contents.
  const fs = require("fs");
  fs.readFile("urls.csv", "utf-8", (readError, contents) => {
    // Exactly one of the two is printed: the error when present, otherwise the text.
    const output = readError ? readError : contents;
    console.log(output);
  });

CSV als Stream lesen und in JSON speichern

// Stream data.csv through csv-parser and collect every parsed row in `results`.
const csv = require('csv-parser');
const fs = require('fs');
const results = [];

const parser = fs.createReadStream('data.csv').pipe(csv());

// Each 'data' event delivers one parsed row.
parser.on('data', (row) => {
  results.push(row);
});

// Print the collected rows once the stream signals 'end'.
parser.on('end', () => {
  console.log(results);
  // Example output:
  // [
  //   { NAME: 'Daffy Duck', AGE: '24' },
  //   { NAME: 'Bugs Bunny', AGE: '22' }
  // ]
});


So kann man eine Liste mit URLs sequentiell im Browser analysieren:

const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');
const fname = 'urls-short.csv';
// One entry per URL: {url, style: [...]} on success, {url, error: '...'} otherwise.
const allStyles = [];

/**
 * Reads the `url` column of the CSV file `fname`, then visits the URLs one
 * after another in a single browser and stores the distinct computed
 * font-family values of each page in `allStyles`.
 */
(async () => {
    const browser = await puppeteer.launch({headless:false});
    const csvPipe = fs.createReadStream(fname).pipe(csv());
    const urls = [];

    /**
     * Opens `url` in a new tab and returns the distinct computed font-family
     * values of all elements inside <body>. The tab is closed even when
     * navigation or evaluation fails, so failed pages do not pile up.
     *
     * @param {string} url - address to load; the caller validates it first
     * @returns {Promise<string[]>} de-duplicated font-family strings
     */
    const checkPage = async (url) => {
      const page = await browser.newPage();
      try {
        // setViewport returns a promise; await it so the size is applied before navigating.
        await page.setViewport({ width: 1280, height: 800, deviceScaleFactor: 1 });
        page.setDefaultNavigationTimeout(0); // 0 disables the navigation timeout
        // waitUntil alternatives: domcontentloaded, networkidle0, networkidle2
        await page.goto(url, { waitUntil: 'networkidle2' });
        // TODO: check the HTTP status of the response that page.goto() resolves with
        const fonts = await page.evaluate(() => {
          const elements = document.body.getElementsByTagName("*");

          // One computed font-family string per element.
          return [...elements].map(element => {
            element.focus();
            return window.getComputedStyle(element).getPropertyValue("font-family");
          });
        });
        return uniq(fonts);
      } finally {
        await page.close();
      }
    };

    // read all rows
    csvPipe.on('data', row => {
      urls.push(row.url);
    }).on('end', async () => {

      // all urls in the array now -> process them
      console.log('urls to check:');
      console.log(urls);

      try {
        // note: forEach would not work here. Use for or map (for parallel execution)
        for (const url of urls){
          console.log('url:');
          console.log(url);

          // validURL returns a boolean (it never throws), so test its result.
          if (!validURL(url)) {
            allStyles.push({url:url,error:'url not valid'});
            continue;
          }

          try {
            const myStyles = await checkPage(url);
            console.log(myStyles);
            allStyles.push({url:url,'style':myStyles});
          } catch (error) {
            // A single failing page must not abort the rest of the batch.
            console.error(error);
            allStyles.push({url:url,error:error.message});
          }
        }

        console.log("finished - allStyles:");
        console.log(allStyles);
      } finally {
        // Always shut the browser down, even if something above threw.
        await browser.close();
      }
    });
})();

/**
 * Loosely checks whether `str` looks like a web address: optional http(s)
 * protocol, a domain name or IPv4 address, and optional port, path, query
 * string and fragment. Matching is case-insensitive.
 *
 * @param {string} str - candidate address
 * @returns {boolean} true when the whole string matches the pattern
 */
function validURL(str) {
  const pattern = new RegExp('^(https?:\\/\\/)?'+ // protocol
    // The tail of a domain label is optional (`?`) instead of repeated (`*`).
    // Both forms accept exactly the same labels, but the repeated form nests
    // quantifiers and backtracks exponentially on long non-matching input.
    '((([a-z\\d]([a-z\\d-]*[a-z\\d])?)\\.)+[a-z]{2,}|'+ // domain name
    '((\\d{1,3}\\.){3}\\d{1,3}))'+ // OR ip (v4) address
    '(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*'+ // port and path
    '(\\?[;&a-z\\d%_.~+=-]*)?'+ // query string
    '(\\#[-a-z\\d_]*)?$','i'); // fragment locator
  return pattern.test(str);
}

/**
 * Returns a new array holding each distinct value of `a` once, in order of
 * first appearance.
 *
 * @param {Iterable<*>} a - values to de-duplicate
 * @returns {Array<*>} the distinct values
 */
function uniq(a) {
  const distinct = new Set(a);
  return [...distinct];
}

Soll der Browser alle URLs parallel abarbeiten, ändern wir die for-Schleife und arbeiten mit map(). Da der async-Callback für jede URL ein Promise zurückgibt, liefert map() ein Array von Promises, auf das man Promise.all() anwenden kann. So werden alle Tasks parallel gestartet, und erst wenn alle fertig sind, wird zum nächsten Statement gesprungen.

//...
   // all urls in the array now -> process them
      console.log('urls to check:');
      console.log(urls);

      // Start one task per URL at once; Promise.all resolves when all have finished.
      await Promise.all(urls.map(async (url) => {
        console.log('url:');
        console.log(url);
        if (!validURL(url)) {
          return;
        }
        try {
          // Declared locally: an undeclared `myStyles` would be an implicit
          // global shared by all concurrently running tasks.
          const myStyles = await checkPage(url);
          console.log(myStyles);
          allStyles.push({url:url,'style':myStyles});
        } catch (error) {
          // Record the failure instead of rejecting Promise.all while the
          // other pages are still being processed.
          console.error(error);
          allStyles.push({url:url,error:error.message});
        }
      }));
// ...