Puppeteer - NodeJS Scraping: Unterschied zwischen den Versionen
| (15 dazwischenliegende Versionen von 4 Benutzern werden nicht angezeigt) | |||
| Zeile 1: | Zeile 1: | ||
| + | == Puppeteer == | ||
| + | Puppeteer Hauptseite. | ||
| + | |||
== Links == | == Links == | ||
| + | https://pptr.dev/ | ||
https://www.youtube.com/watch?v=CngYXf9aeg8&list=PLGreOtbNU07rDURvnQpDaT3XokxlranUQ | https://www.youtube.com/watch?v=CngYXf9aeg8&list=PLGreOtbNU07rDURvnQpDaT3XokxlranUQ | ||
https://blog.risingstack.com/pdf-from-html-node-js-puppeteer/ | https://blog.risingstack.com/pdf-from-html-node-js-puppeteer/ | ||
| + | https://advancedweb.hu/how-to-speed-up-puppeteer-scraping-with-parallelization/ | ||
| + | https://jsoverson.medium.com/using-chrome-devtools-protocol-with-puppeteer-737a1300bac0 * | ||
== Quickstart == | == Quickstart == | ||
https://www.youtube.com/watch?v=Sag-Hz9jJNg | https://www.youtube.com/watch?v=Sag-Hz9jJNg | ||
Voraussetzung: VisualStudioCode, NodeJS installiert | Voraussetzung: VisualStudioCode, NodeJS installiert | ||
| − | Ordner erstellen und NodeJS Projekt starten | + | === Ordner erstellen und NodeJS Projekt starten === |
| − | Terminal | + | '''Terminal''' |
npm init -y | npm init -y | ||
npm install puppeteer | npm install puppeteer | ||
| Zeile 14: | Zeile 20: | ||
Installiert auch Chromium. Schau mal in die package.json | Installiert auch Chromium. Schau mal in die package.json | ||
| + | ==== Zusätzliche Module ==== | ||
| + | Oft benötigt man zusätzliche Module z. B. zum CSV-Parsen oder Zugriff auf das Filesystem. Diese einfach mit npm installieren: | ||
| + | npm install csv-parser | ||
| + | npm install fs | ||
| + | usw. | ||
| + | Dann kann man sie im Skript einbinden wie Puppeteer. | ||
| + | const csv = require('csv-parser'); | ||
| + | const fs = require('fs'); | ||
| + | const puppeteer = require("puppeteer"); | ||
| + | |||
| + | === Grundstruktur === | ||
Als Basis kommt fast immer ein Konstrukt ähnlich dem folgenden zum Einsatz. Im Wesentlichen passiert folgendes: | Als Basis kommt fast immer ein Konstrukt ähnlich dem folgenden zum Einsatz. Im Wesentlichen passiert folgendes: | ||
| Zeile 29: | Zeile 46: | ||
</syntaxhighlight> | </syntaxhighlight> | ||
| − | Beispiel Screenshot von Seite anfertigen | + | === Beispiel Screenshot von Seite anfertigen === |
<syntaxhighlight lang="javascript"> | <syntaxhighlight lang="javascript"> | ||
const puppeteer = require("puppeteer"); | const puppeteer = require("puppeteer"); | ||
| Zeile 42: | Zeile 59: | ||
</syntaxhighlight> | </syntaxhighlight> | ||
| − | Starten mit | + | '''Starten mit''' |
node index.js | node index.js | ||
| Zeile 235: | Zeile 252: | ||
} | } | ||
</syntaxhighlight> | </syntaxhighlight> | ||
| + | |||
| + | === Input- und Output-Files === | ||
| + | https://github.com/Zrce/puppeteer-coverage-report-test/blob/master/index.js | ||
| + | https://stackoverflow.com/questions/59981135/puppeteer-iterate-over-a-csv-file-and-screenshot-for-each-row | ||
| + | |||
| + | '''CSV Datei mit URLs abarbeiten - seriell/parallel''' | ||
| + | [[Puppeteer - CSV Datei sequentiell / parallel a abarbeiten (Beispiel)]] | ||
| + | |||
| + | <syntaxhighlight lang="javascript"> | ||
| + | |||
| + | </syntaxhighlight> | ||
| + | |||
| + | === Bilder und Screenshots === | ||
| + | ==== Screenshots in pdf einbetten ==== | ||
| + | * Voraussetzung sind diese Flags für den Filezugriff: ''--allow-file-access-from-files, --enable-local-file-accesses'' | ||
| + | * Screenshot anfertigen | ||
| + | * Base64 codieren | ||
| + | * Einbetten | ||
| + | <syntaxhighlight lang="javascript"> | ||
| + | |||
| + | </syntaxhighlight> | ||
| + | |||
| + | |||
| + | ==== Bilder aus DOM speichern und in pdf umwandeln ==== | ||
| + | https://stackoverflow.com/questions/59677228/convert-screenshot-to-pdf-in-puppeteer | ||
| + | <syntaxhighlight lang="javascript"> | ||
| + | await page.goto('https://www.chromestatus.com/samples', {waitUntil: 'networkidle0'}); | ||
| + | |||
| + | async function screenshotDOMElement(opts = {}) { | ||
| + | const padding = 'padding' in opts ? opts.padding : 0; | ||
| + | const path = 'path' in opts ? opts.path : null; | ||
| + | const selector = opts.selector; | ||
| + | |||
| + | if (!selector) | ||
| + | throw Error('Please provide a selector.'); | ||
| + | |||
| + | const rect = await page.evaluate(selector => { | ||
| + | const element = document.querySelector(selector); | ||
| + | if (!element) | ||
| + | return null; | ||
| + | const {x, y, width, height} = element.getBoundingClientRect(); | ||
| + | return {left: x, top: y, width, height, id: element.id}; | ||
| + | }, selector); | ||
| + | |||
| + | if (!rect) | ||
| + | throw Error(`Could not find element that matches selector: ${selector}.`); | ||
| + | |||
| + | return await page.screenshot({ | ||
| + | path, | ||
| + | clip: { | ||
| + | x: rect.left - padding, | ||
| + | y: rect.top - padding, | ||
| + | width: rect.width + padding * 2, | ||
| + | height: rect.height + padding * 2 | ||
| + | } | ||
| + | }); | ||
| + | } | ||
| + | |||
| + | await screenshotDOMElement({ | ||
| + | path: 'element.png', | ||
| + | selector: 'header aside', | ||
| + | padding: 16 | ||
| + | }); | ||
| + | |||
| + | browser.close(); | ||
| + | captureDomTOoPDF(); | ||
| + | })(); | ||
| + | |||
| + | |||
| + | function captureDomTOoPDF(){ | ||
| + | (async () => { | ||
| + | const browser = await puppeteer.launch({args: ['--allow-file-access-from-files', '--enable-local-file-accesses']}); | ||
| + | const page = await browser.newPage(); | ||
| + | const image = 'data:image/png;base64,' + base64Encode('element.png'); | ||
| + | await page.goto(image, {waitUntil: 'networkidle0'}); | ||
| + | await page.pdf({path: 'output.pdf', format: 'A4'}); | ||
| + | |||
| + | await browser.close(); | ||
| + | console.log("done"); | ||
| + | })(); | ||
| + | } | ||
| + | </syntaxhighlight> | ||
| + | |||
| + | === Zertifikate und Puppeteer === | ||
| + | [[Puppeteer - Zertifikate handeln]] | ||
| + | |||
| + | === Network Request Control === | ||
| + | https://github.com/puppeteer/puppeteer/blob/main/examples/block-images.js | ||
| + | [[Puppeteer - RequestInterception]] | ||
Aktuelle Version vom 7. Dezember 2022, 14:16 Uhr
Puppeteer[Bearbeiten]
Puppeteer Hauptseite.
Links[Bearbeiten]
https://pptr.dev/
https://www.youtube.com/watch?v=CngYXf9aeg8&list=PLGreOtbNU07rDURvnQpDaT3XokxlranUQ
https://blog.risingstack.com/pdf-from-html-node-js-puppeteer/
https://advancedweb.hu/how-to-speed-up-puppeteer-scraping-with-parallelization/
https://jsoverson.medium.com/using-chrome-devtools-protocol-with-puppeteer-737a1300bac0 *
Quickstart[Bearbeiten]
https://www.youtube.com/watch?v=Sag-Hz9jJNg
Voraussetzung: VisualStudioCode, NodeJS installiert
Ordner erstellen und NodeJS Projekt starten[Bearbeiten]
Terminal
npm init -y
npm install puppeteer
Installiert auch Chromium. Schau mal in die package.json
Zusätzliche Module[Bearbeiten]
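Oft benötigt man zusätzliche Module, z. B. zum CSV-Parsen oder für den Zugriff auf das Dateisystem. Externe Module einfach mit npm installieren:
npm install csv-parser
usw. Das Modul fs gehört bereits zu Node.js und muss nicht installiert werden (npm install fs ist daher überflüssig). Dann kann man die Module im Skript einbinden wie Puppeteer.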
const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require("puppeteer");
Grundstruktur[Bearbeiten]
Als Basis kommt fast immer ein Konstrukt ähnlich dem folgenden zum Einsatz. Im Wesentlichen passiert folgendes:
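index.js erstellen. Puppeteer laden mit asynchroner Funktion. Diese Funktion wird sofort ausgeführt: Sie startet den Browser, öffnet einen Tab, ruft die URL auf und schließt den Browser am Ende wieder.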
const puppeteer = require("puppeteer"); // load the Puppeteer library

// Immediately invoked async function, so that `await` can be used in a plain script.
(async () => {
  // Open a new browser - headless (default) or with a visible window.
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage(); // open a new tab
    await page.goto("https://schlegel.media/"); // navigate to a url
    // do s.th.
  } finally {
    // Runs even when a step above throws, so the browser is always closed.
    await browser.close();
  }
})();
Beispiel Screenshot von Seite anfertigen[Bearbeiten]
const puppeteer = require("puppeteer");

/** Opens schlegel.media in a visible browser window and saves a screenshot of the page. */
async function takeScreenshot() {
  // headless: false shows the browser window while the script runs
  const browser = await puppeteer.launch({ headless: false });
  const tab = await browser.newPage();
  await tab.goto("https://schlegel.media");
  await tab.screenshot({ path: "screenshot.png" });
  await browser.close();
}

takeScreenshot();
Starten mit
node index.js
Beispiel Skripte[Bearbeiten]
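Hinweis: Da die Skripte in diesem Setup keine ES Module sind, gab es bei mir Probleme in Node, wenn man die Strichpunkte weglässt. Grund: Ohne Strichpunkt nach require("puppeteer") wird die folgende Zeile (async () => { ... }) als Funktionsaufruf auf dem Ergebnis von require gelesen, was zu einem TypeError führt.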
DOM Elemente scrapen mit evaluate[Bearbeiten]
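Zum Scrapen bietet sich die evaluate Funktion an. Die übergebene Funktion wird im Kontext der Seite ausgeführt und hat damit Zugriff auf das DOM.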
// The semicolon after require(...) is mandatory here: without it the
// parenthesised async function below is parsed as a call on the result of
// require(...) and the script fails with a TypeError.
const puppeteer = require("puppeteer");

(async () => {
  // headless: false shows the browser window while the script runs
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage(); // open new tab in browser
    await page.goto("https://schlegel.media");

    // The callback is executed inside the page, so it can use the DOM API.
    const grabSlogan = await page.evaluate(() => {
      const slogan = document.querySelector(".uk-text-lead");
      // slogan.innerHTML would include the html tags; innerText returns only the text
      return slogan.innerText;
    });

    console.log(grabSlogan);
  } finally {
    // Runs even when a step above throws, so the browser is always closed.
    await browser.close();
  }
})();
// grab multiple elements
// ... same setup as in the previous example
const grabList = await page.evaluate(() => {
  const entries = document.querySelectorAll(".uk-nav-default li");
  // Collect the visible text of every matching list entry.
  return Array.from(entries, (entry) => entry.innerText);
});
console.log(grabList);
Komplexere DOM-Zugriffe
const puppeteer = require("puppeteer");

/** Reads text and author of every quote on quotes.toscrape.com and logs the list. */
async function scrapeQuotes() {
  // headless: false shows the browser window while the script runs
  const browser = await puppeteer.launch({ headless: false });
  const tab = await browser.newPage();
  await tab.goto("https://quotes.toscrape.com/");

  const grab = await tab.evaluate(() => {
    const quoteNodes = document.querySelectorAll(".quote");
    // One result object per .quote element.
    return Array.from(quoteNodes, (node) => {
      const spans = node.querySelectorAll("span");
      return {
        // first span holds the quote text
        quote: spans[0].innerHTML,
        // second span contains a <small> element with the author
        author: spans[1].querySelector("small").innerHTML,
      };
    });
  });

  console.log(grab);
  await browser.close();
}

scrapeQuotes();
User actions simulieren[Bearbeiten]
const puppeteer = require("puppeteer");

(async () => {
  // headless: false shows the browser window while the script runs
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage(); // open new tab in browser
  await page.goto("https://quotes.toscrape.com/");

  // Clicking the login link loads a new page. Start waiting for that
  // navigation before the click, so typing only begins once the form exists.
  await Promise.all([
    page.waitForNavigation(),
    page.click('a[href="/login"]'), // click login link
  ]);

  await page.type('#username', 'myUserName', { delay: 300 }); // delay slows down the typing
  await page.type('#password', 'mySecret');

  // Submitting the form navigates again; wait until that has finished.
  await Promise.all([
    page.waitForNavigation(),
    page.click('input[type="submit"]'),
  ]);

  // browser.close() is not called here (presumably on purpose, so the result
  // stays visible). Add `await browser.close();` to end the session.
})();
Computed Styles von DOM Elementen auslesen[Bearbeiten]
Styles eines DOM Elements finden. Hier nutzen wir mal die $eval Funktion.
const puppeteer = require("puppeteer");

// Executed inside the page by $eval: returns the computed font-family of the element.
const readFontFamily = (el) => getComputedStyle(el).getPropertyValue('font-family');

/** Logs the computed font-family of the <body> element of schlegel.media. */
async function printBodyFont() {
  const browser = await puppeteer.launch({ headless: true });
  const tab = await browser.newPage();
  await tab.goto("https://schlegel.media/");

  // $eval selects the first element matching the selector and passes it to the callback
  const myStyles = await tab.$eval('body', readFontFamily);
  console.log(myStyles);

  await browser.close();
}

printBodyFont();
Hinweis: Handle Functions sind nicht so performant aber eher menschenähnlich. Bei einem Klick würde der Browser tatsächlich die Maus bewegen statt einfach einen Klick Event zu senden.
Evaluate-Version – besser zu debuggen. Zu den Unterschieden in der Ausführung siehe: https://stackoverflow.com/questions/55664420/page-evaluate-vs-puppeteer-methods
const puppeteer = require("puppeteer");

/** Same result as the $eval variant, but the element lookup happens inside evaluate(). */
async function printBodyFontViaEvaluate() {
  const browser = await puppeteer.launch({ headless: true });
  const tab = await browser.newPage();
  await tab.goto("https://schlegel.media/");

  // The callback runs in the page: look up <body> and read its computed font-family.
  const getStyles = await tab.evaluate(() =>
    getComputedStyle(document.querySelector('body')).getPropertyValue('font-family')
  );
  console.log(getStyles);

  await browser.close();
}

printBodyFontViaEvaluate();
So kann man alle Styles auslesen:
const puppeteer = require("puppeteer");

/** Logs the own properties of the computed style object of the first <p> element. */
async function dumpParagraphStyles() {
  const browser = await puppeteer.launch({ headless: true });
  const tab = await browser.newPage();
  await tab.goto("https://schlegel.media/");

  const getStyles = await tab.evaluate(() => {
    // For a single property use getComputedStyle(el).getPropertyValue('font-family') instead.
    const computed = getComputedStyle(document.querySelector('p'));
    const collected = {};
    for (const key in computed) {
      // skip everything that only comes from the prototype chain
      if (!computed.hasOwnProperty(key)) {
        continue;
      }
      collected[key] = computed[key];
    }
    return collected;
  });

  console.log(getStyles);
  await browser.close();
}

dumpParagraphStyles();
PDF generieren[Bearbeiten]
Siehe auch
https://blog.risingstack.com/pdf-from-html-node-js-puppeteer/
const puppeteer = require("puppeteer");

// Settings handed to page.pdf() below.
const pdfOptions = {
  path: "myWebsite.pdf", // file the PDF is written to
  format: 'A4', // default is letter
  margin: { top: '100px', bottom: '100px' },
  printBackground: true,
  displayHeaderFooter: true,
  // Header shows the element with class "title", footer the page counter.
  headerTemplate: `<p style="font-size: 10px; font-family: Arial, Helvetica, sans-serif; margin: 0 auto;"><span class="title"></span></p>`,
  footerTemplate: `<p style="font-size:10px; font-family: Arial, Helvetica, sans-serif; margin: 0 auto;"><span class="pageNumber"></span> of <span class="totalPages"></span></p>`,
};

/** Loads schlegel.media headless and prints the page to myWebsite.pdf. */
async function printPdf() {
  const browser = await puppeteer.launch({ headless: true });
  const tab = await browser.newPage();
  await tab.goto("https://schlegel.media/");
  await tab.pdf(pdfOptions); // print pdf
  await browser.close();
}

printPdf();
Crawl multiple pages[Bearbeiten]
https://stackoverflow.com/questions/46293216/crawling-multiple-urls-in-a-loop-using-puppeteer
// Fragment: expects a Puppeteer `page` from the surrounding async function.
page.setDefaultNavigationTimeout(0); // prevent timeout after 30s.
//...
const urls = ['url', 'url', 'url' /* , ... */];
for (const url of urls) {
  // goto() already waits for the navigation it starts. A separate
  // waitForNavigation() afterwards would wait for a navigation that never
  // comes, and with the timeout disabled above it would wait forever.
  await page.goto(url, { waitUntil: 'networkidle2' });
}
Input- und Output-Files[Bearbeiten]
https://github.com/Zrce/puppeteer-coverage-report-test/blob/master/index.js https://stackoverflow.com/questions/59981135/puppeteer-iterate-over-a-csv-file-and-screenshot-for-each-row
CSV Datei mit URLs abarbeiten - seriell/parallel Puppeteer - CSV Datei sequentiell / parallel a abarbeiten (Beispiel)
Bilder und Screenshots[Bearbeiten]
Screenshots in pdf einbetten[Bearbeiten]
- Voraussetzung sind diese Flags für den Filezugriff: --allow-file-access-from-files, --enable-local-file-accesses
- Screenshot anfertigen
- Base64 codieren
- Einbetten
Bilder aus DOM speichern und in pdf umwandeln[Bearbeiten]
https://stackoverflow.com/questions/59677228/convert-screenshot-to-pdf-in-puppeteer
await page.goto('https://www.chromestatus.com/samples', {waitUntil: 'networkidle0'});
/**
 * Takes a screenshot of the first element that matches a CSS selector.
 * Uses the Puppeteer `page` object from the enclosing scope.
 *
 * @param {object} [opts]
 * @param {string} opts.selector - CSS selector of the element (required).
 * @param {number} [opts.padding=0] - Extra pixels captured around the element on every side.
 * @param {?string} [opts.path=null] - Passed through to page.screenshot() as `path`.
 * @returns {Promise<*>} Whatever page.screenshot() resolves to.
 * @throws {Error} If no selector is given or no element matches it.
 */
async function screenshotDOMElement(opts = {}) {
  // Defaults also apply when a key is present but undefined, so an explicit
  // `padding: undefined` no longer produces a NaN clip rectangle.
  const { selector, padding = 0, path = null } = opts;

  if (!selector) {
    throw new Error('Please provide a selector.');
  }

  // Runs inside the page: measure the element, or report null if it is missing.
  const rect = await page.evaluate((sel) => {
    const element = document.querySelector(sel);
    if (!element) {
      return null;
    }
    const { x, y, width, height } = element.getBoundingClientRect();
    return { left: x, top: y, width, height, id: element.id };
  }, selector);

  if (!rect) {
    throw new Error(`Could not find element that matches selector: ${selector}.`);
  }

  // Grow the element's rectangle by `padding` on every side.
  return await page.screenshot({
    path,
    clip: {
      x: rect.left - padding,
      y: rect.top - padding,
      width: rect.width + padding * 2,
      height: rect.height + padding * 2,
    },
  });
}
await screenshotDOMElement({
path: 'element.png',
selector: 'header aside',
padding: 16
});
browser.close();
captureDomTOoPDF();
})();
/**
 * Converts the previously saved screenshot `element.png` into `output.pdf`.
 * The image is embedded as a base64 data URL, opened in a new browser tab and
 * printed as an A4 PDF. Expects `puppeteer` to be available in the enclosing scope.
 *
 * @returns {Promise<void>} Settles after the PDF was written and the browser was closed.
 */
async function captureDomTOoPDF() {
  const fs = require('fs'); // Node.js core module, nothing to install

  // Read the PNG and base64-encode it in one step.
  const image = `data:image/png;base64,${fs.readFileSync('element.png', 'base64')}`;

  const browser = await puppeteer.launch({args: ['--allow-file-access-from-files', '--enable-local-file-accesses']});
  try {
    const page = await browser.newPage();
    await page.goto(image, {waitUntil: 'networkidle0'});
    await page.pdf({path: 'output.pdf', format: 'A4'});
  } finally {
    // Runs even when a step above throws, so the browser is always closed.
    await browser.close();
  }
  console.log("done");
}
Zertifikate und Puppeteer[Bearbeiten]
Puppeteer - Zertifikate handeln
Network Request Control[Bearbeiten]
https://github.com/puppeteer/puppeteer/blob/main/examples/block-images.js Puppeteer - RequestInterception