Puppeteer - NodeJS Scraping: Unterschied zwischen den Versionen

Aus Wikizone
Wechseln zu: Navigation, Suche
Zeile 239: Zeile 239:
 
  https://github.com/Zrce/puppeteer-coverage-report-test/blob/master/index.js
 
  https://github.com/Zrce/puppeteer-coverage-report-test/blob/master/index.js
 
  https://stackoverflow.com/questions/59981135/puppeteer-iterate-over-a-csv-file-and-screenshot-for-each-row
 
  https://stackoverflow.com/questions/59981135/puppeteer-iterate-over-a-csv-file-and-screenshot-for-each-row
 +
 +
'''CSV Datei mit URLs abarbeiten und Screenshots anfertigen'''
 +
<syntaxhighlight lang="javascript">
 +
const csv = require('csv-parser');
 +
const fs = require('fs');
 +
const puppeteer = require('puppeteer');
 +
 +
(async () => {
 +
  const browser = await puppeteer.launch();
 +
 +
  // function executed for each url in csv
 +
  const getFile = async (url, path) => {
 +
    const page = await browser.newPage();
 +
    page.setViewport({ width: 1280, height: 800, deviceScaleFactor: 1 });
 +
    const response = await page.goto(url, { waitUntil: 'networkidle2' }); // wait until options load, domcontentloaded networkidle0, networkidle2
 +
    const body = await page.$('body');
 +
    await body.screenshot({path: path });
 +
    page.close();
 +
  };
 +
 +
  // read csv and call getFile for each url
 +
  let fname = 'urls.csv';
 +
  let c = 0;
 +
  const csvPipe = fs.createReadStream(fname).pipe(csv());
 +
  csvPipe.on('data', async (row) => {
 +
    c++;
 +
    let id = row.url;
 +
    console.log('url: ' + id);    // evt. check if url is valid
 +
    let path = './images/' + c + '.png';
 +
    csvPipe.pause();
 +
    await getFile(id, path);
 +
    csvPipe.resume();
 +
  }).on('end', () => {
 +
    console.log('CSV file successfully processed');
 +
  });
 +
})();
 +
</syntaxhighlight>

Version vom 18. August 2022, 14:56 Uhr

Links

https://www.youtube.com/watch?v=CngYXf9aeg8&list=PLGreOtbNU07rDURvnQpDaT3XokxlranUQ
https://blog.risingstack.com/pdf-from-html-node-js-puppeteer/

Quickstart

https://www.youtube.com/watch?v=Sag-Hz9jJNg

Voraussetzung: VisualStudioCode, NodeJS installiert

Ordner erstellen und NodeJS Projekt starten

Terminal

npm init -y
npm install puppeteer

Installiert auch Chromium. Schau mal in die package.json

Als Basis kommt fast immer ein Konstrukt ähnlich dem folgenden zum Einsatz. Im Wesentlichen passiert folgendes:

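index.js erstellen und Puppeteer laden. Der eigentliche Ablauf steckt in einer asynchronen Funktion, die sofort aufgerufen wird, damit await verwendet werden kann. Diese Funktion startet den Browser, öffnet einen Tab, ruft eine URL auf und schließt den Browser wieder.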

const puppeteer = require("puppeteer"); // load the Puppeteer library

// Basic skeleton used by most scripts: launch a browser, open a tab,
// navigate to a URL, do the work, then close the browser again.
(async () => {
  const browser = await puppeteer.launch({headless: true}); // open a new browser - headless (default) or with a visible window
  try {
    const page = await browser.newPage();  // open a new tab
    await page.goto("https://schlegel.media/"); // navigate to a url
    // do s.th.
  } finally {
    // Runs even when a step above throws, so the launched browser is
    // always closed again.
    await browser.close();
  }
}) ();

Beispiel Screenshot von Seite anfertigen:

const puppeteer = require("puppeteer");

// Opens https://schlegel.media in a visible browser window and stores a
// screenshot of the page as screenshot.png.
(async () => {
  const launchOptions = {headless: false}; // false = show the browser window
  const chromium = await puppeteer.launch(launchOptions);
  const tab = await chromium.newPage(); // new tab in that browser

  await tab.goto("https://schlegel.media");
  await tab.screenshot({path: "screenshot.png"});

  await chromium.close();
})();

Starten mit

node index.js

Beispiel Skripte

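Hinweis: Bei mir gab es in Node Probleme, wenn man die Strichpunkte weglässt (die Skripte in diesem Setup sind keine ES Module). Ursache: Fehlt der Strichpunkt nach der require-Zeile, wird die folgende, mit einer Klammer beginnende Zeile als Funktionsaufruf auf dem Ergebnis von require(...) gelesen.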

DOM Elemente scrapen mit evaluate

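Zum Scrapen bietet sich die evaluate-Funktion an. Die übergebene Funktion wird im Kontext der geladenen Seite ausgeführt, ihr Rückgabewert steht anschließend im Node-Skript zur Verfügung.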

// The semicolon after require(...) is required: the next statement starts
// with "(", so without it the parser would treat that line as a call on the
// value returned by require(...).
const puppeteer = require("puppeteer");

// Reads the text of the first ".uk-text-lead" element on the page and logs it.
(async () => {
  const browser = await puppeteer.launch({headless: false}); // false = show the browser window
  try {
    const page = await browser.newPage(); // open new tab in browser
    await page.goto("https://schlegel.media");

    // The callback runs inside the page, so `document` is the page's DOM.
    const grabSlogan = await page.evaluate(() => {
      const slogan = document.querySelector(".uk-text-lead");
      if (slogan === null) {
        return null; // no matching element on the page
      }
      // slogan.innerHTML would return the markup including tags
      return slogan.innerText; // only the text
    });

    console.log(grabSlogan);
  } finally {
    // Close the browser even when navigation or evaluation fails.
    await browser.close();
  }
})();

// grab multiple elements

//... same setup as in the previous example
  // Collects the text of every ".uk-nav-default li" element into an array.
  const grabList = await page.evaluate(() => {
    const navItems = document.querySelectorAll(".uk-nav-default li");
    // Array.from with a mapper turns the NodeList into an array of texts.
    return Array.from(navItems, (item) => item.innerText);
  });
  console.log(grabList);

Komplexere DOM-Zugriffe

const puppeteer = require("puppeteer");

// Scrapes every ".quote" element on quotes.toscrape.com into an array of
// {quote, author} objects and logs that array.
(async () => {
  const chromium = await puppeteer.launch({headless: false}); // false = show the browser window
  const tab = await chromium.newPage();
  await tab.goto("https://quotes.toscrape.com/");

  const grab = await tab.evaluate(() =>
    Array.from(document.querySelectorAll(".quote"), (quoteNode) => {
      // First span holds the quote text, second span wraps the <small> author tag.
      const [textSpan, authorSpan] = quoteNode.querySelectorAll("span");
      const quoteText = textSpan.innerHTML;
      const quoteAuthor = authorSpan.querySelector("small").innerHTML;
      return {quote: quoteText, author: quoteAuthor};
    })
  );

  console.log(grab);
  await chromium.close();
})();

User actions simulieren

const puppeteer = require("puppeteer");

// Simulates a user: follows the login link, fills in the form and submits it.
(async () => {
  const browser = await puppeteer.launch({headless: false}); // false = show the browser window
  const page = await browser.newPage(); // open new tab in browser
  await page.goto("https://quotes.toscrape.com/");

  // Clicking the login link starts a navigation. Wait for it together with
  // the click; otherwise the typing below can run before the login form
  // exists. The waiter is registered first so the navigation is not missed.
  await Promise.all([
    page.waitForNavigation(),
    page.click('a[href="/login"]'), // click login link
  ]);

  await page.type('#username','myUserName',{delay:300}); // delay between key presses
  await page.type('#password','mySecret');
  await page.click('input[type="submit"]');

  // The browser is left open on purpose so the result can be inspected;
  // add `await browser.close();` here to end the session automatically.
}) ();

Computed Styles von DOM Elementen auslesen

Styles eines DOM Elements finden. Hier nutzen wir mal die $eval Funktion.

const puppeteer = require("puppeteer");

// Logs the computed font-family of the page's <body>, using page.$eval.
(async () => {
  const chromium = await puppeteer.launch({headless: true}); // true = no visible window
  const tab = await chromium.newPage();
  await tab.goto("https://schlegel.media/");

  // $eval looks up the selector and passes the matched element to the callback.
  const readFontFamily = (el) => getComputedStyle(el).getPropertyValue('font-family');
  const myStyles = await tab.$eval('body', readFontFamily);
  console.log(myStyles);

  await chromium.close();
})();

Hinweis: Handle-Funktionen sind nicht so performant, verhalten sich aber eher wie ein Mensch. Bei einem Klick würde der Browser tatsächlich die Maus bewegen, statt einfach ein Klick-Event zu senden.

Evaluate-Version – besser zu debuggen. Zu den Unterschieden in der Ausführung siehe: https://stackoverflow.com/questions/55664420/page-evaluate-vs-puppeteer-methods

const puppeteer = require("puppeteer");

// Logs the computed font-family of the page's <body>, using page.evaluate.
(async () => {
  const chromium = await puppeteer.launch({headless: true}); // true = no visible window
  const tab = await chromium.newPage();
  await tab.goto("https://schlegel.media/");

  // The callback runs in the page: look up <body> and read one computed property.
  const getStyles = await tab.evaluate(() =>
    getComputedStyle(document.querySelector('body')).getPropertyValue('font-family')
  );
  console.log(getStyles);

  await chromium.close();
})();

So kann man alle Styles auslesen:

const puppeteer = require("puppeteer");

// Logs an object with the own properties of the computed style of the
// first <p> element on the page.
(async () => {
  const chromium = await puppeteer.launch({headless: true}); // true = no visible window
  const tab = await chromium.newPage();
  await tab.goto("https://schlegel.media/");

  const getStyles = await tab.evaluate(() => {
    const computed = getComputedStyle(document.querySelector('p'));
    // For a single value use computed.getPropertyValue('font-family') instead.
    const collected = {};
    for (const key in computed) {
      // Skip properties that are inherited rather than own.
      if (!Object.prototype.hasOwnProperty.call(computed, key)) {
        continue;
      }
      collected[key] = computed[key];
    }
    return collected;
  });
  console.log(getStyles);

  await chromium.close();
})();

PDF generieren

Siehe auch

https://blog.risingstack.com/pdf-from-html-node-js-puppeteer/
const puppeteer = require("puppeteer");

// Renders https://schlegel.media/ to myWebsite.pdf with a header and footer.
(async () => {
  const chromium = await puppeteer.launch({headless: true}); // true = no visible window
  const tab = await chromium.newPage();
  await tab.goto("https://schlegel.media/");

  // Vertical margins used for the printed page.
  const pageMargin = {
    top: '100px',
    bottom: '100px'
  };

  // Elements with the classes title, pageNumber and totalPages are
  // presumably filled in by the PDF renderer (see Puppeteer's page.pdf docs).
  const pdfOptions = {
    path: "myWebsite.pdf", // output file; the options below are optional
    format: 'A4', // default is letter
    margin: pageMargin,
    printBackground: true,
    displayHeaderFooter: true,
    headerTemplate: `<p style="font-size: 10px; font-family: Arial, Helvetica, sans-serif; margin: 0 auto;"><span class="title"></span></p>`,
    footerTemplate: `<p style="font-size:10px; font-family: Arial, Helvetica, sans-serif; margin: 0 auto;"><span class="pageNumber"></span> of <span class="totalPages"></span></p>`
  };

  // print pdf
  await tab.pdf(pdfOptions);

  await chromium.close();
})();

Crawl multiple pages

https://stackoverflow.com/questions/46293216/crawling-multiple-urls-in-a-loop-using-puppeteer
page.setDefaultNavigationTimeout(0); // prevent timeout after 30s.
//...
const urls = ['url', 'url', 'url']; // placeholder entries - put the real urls here

// Visit the urls one after another in the same tab.
for (const url of urls) {
  // goto itself waits for the navigation it starts, so the wait condition is
  // passed here. A separate waitForNavigation() after goto would wait for a
  // further navigation that never happens - and with the timeout disabled
  // above it would never return.
  await page.goto(url, { waitUntil: 'networkidle2' });
}

Input- und Output-Files

https://github.com/Zrce/puppeteer-coverage-report-test/blob/master/index.js
https://stackoverflow.com/questions/59981135/puppeteer-iterate-over-a-csv-file-and-screenshot-for-each-row

CSV Datei mit URLs abarbeiten und Screenshots anfertigen

const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

// Reads urls.csv row by row and stores a screenshot of each row's `url`
// column as ./images/<row number>.png.
(async () => {
  const browser = await puppeteer.launch();

  /**
   * Opens `url` in a new tab and saves a screenshot of its <body> to `path`.
   * @param {string} url - address to load
   * @param {string} path - file the screenshot is written to
   */
  const getFile = async (url, path) => {
    const page = await browser.newPage();
    try {
      await page.setViewport({ width: 1280, height: 800, deviceScaleFactor: 1 });
      // waitUntil options: load, domcontentloaded, networkidle0, networkidle2
      await page.goto(url, { waitUntil: 'networkidle2' });
      const body = await page.$('body');
      await body.screenshot({ path });
    } finally {
      // Close the tab even when loading or the screenshot fails.
      await page.close();
    }
  };

  try {
    const fname = 'urls.csv';
    // Make sure the output folder exists before the first screenshot is written.
    fs.mkdirSync('./images', { recursive: true });

    let c = 0; // running row number, used as the file name
    const csvPipe = fs.createReadStream(fname).pipe(csv());

    // Async iteration pulls the next row only after the previous screenshot
    // is done, and lets errors from getFile propagate instead of being lost
    // in an event handler.
    for await (const row of csvPipe) {
      c++;
      const id = row.url;
      console.log(`url: ${id}`); // possibly check here whether the url is valid
      await getFile(id, `./images/${c}.png`);
    }
    console.log('CSV file successfully processed');
  } finally {
    // Without this the launched browser stays open after the script is done.
    await browser.close();
  }
})();