Puppeteer - NodeJS Scraping: Unterschied zwischen den Versionen
Aus Wikizone
| Zeile 36: | Zeile 36: | ||
== Beispiel Skripte == | == Beispiel Skripte == | ||
| + | Hinweis: Da die Skripte in diesem Setup keine ES Module sind, gab es bei mir Probleme in Node wenn man die Strichpunkte weglässt. | ||
DOM Element auslesen | DOM Element auslesen | ||
<syntaxhighlight lang="javascript"> | <syntaxhighlight lang="javascript"> | ||
| Zeile 70: | Zeile 71: | ||
</syntaxhighlight> | </syntaxhighlight> | ||
| + | Komplexere DOM-Zugriffe | ||
<syntaxhighlight lang="javascript"> | <syntaxhighlight lang="javascript"> | ||
| + | const puppeteer = require("puppeteer"); | ||
| + | (async () => { | ||
| + | const browser = await puppeteer.launch({headless: false}); // launch can launch headless or with displaying | ||
| + | const page = await browser.newPage(); // open new tab in browser | ||
| + | await page.goto("https://quotes.toscrape.com/"); | ||
| + | |||
| + | const grab = await page.evaluate( () => { | ||
| + | let arrElements = []; | ||
| + | const quotes = document.querySelectorAll(".quote"); | ||
| + | quotes.forEach( (quote) => { | ||
| + | const quoteSpans = quote.querySelectorAll("span"); | ||
| + | const quoteText = quoteSpans[0].innerHTML; | ||
| + | const quoteAuthor = quoteSpans[1].querySelector("small").innerHTML; | ||
| + | arrElements.push({'quote': quoteText, 'author': quoteAuthor}); | ||
| + | }); | ||
| + | return arrElements; | ||
| + | }); | ||
| + | |||
| + | console.log(grab); | ||
| + | await browser.close(); | ||
| + | }) (); | ||
</syntaxhighlight> | </syntaxhighlight> | ||
Version vom 17. August 2022, 15:55 Uhr
Quickstart
https://www.youtube.com/watch?v=Sag-Hz9jJNg
Voraussetzung: VisualStudioCode, NodeJS installiert
Ordner erstellen und NodeJS Projekt starten
Terminal
npm init -y
npm install puppeteer
Installiert auch Chromium. Schau mal in die
index.js erstellen. Puppeteer laden mit asynchroner Funktion. Diese Funktion ruft sich direkt nach ihrer Definition selbst auf (IIFE):
// Load Puppeteer (CommonJS). Keep the semicolon: the next statement starts with "(".
const puppeteer = require("puppeteer");

// Async entry point, invoked immediately so that `await` can be used inside it.
(async function main() {
  // scraping steps go here
})();
Beispiel Screenshot von Seite anfertigen:
// Takes a screenshot of a page and saves it as screenshot.png in the working directory.
const puppeteer = require("puppeteer");

(async () => {
  // headless: false shows the browser window; set it to true to run without a UI.
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage(); // open new tab in browser
    await page.goto("https://schlegel.media");
    await page.screenshot({ path: "screenshot.png" });
  } finally {
    // Always close the browser, even if navigation or the screenshot failed,
    // so no browser process is left running.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and signal it via the exit code instead of leaving
  // the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
Starten mit
node index.js
Beispiel Skripte
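Hinweis: Da die Skripte in diesem Setup keine ES-Module sind, gab es bei mir Probleme in Node, wenn man die Strichpunkte weglässt. Grund: Ohne Strichpunkt nach require("puppeteer") wird die folgende Zeile, die mit einer Klammer beginnt, als Funktionsaufruf auf dem Rückgabewert von require interpretiert.
DOM Element auslesen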
// Reads the text of a single DOM element from a page and prints it.
// The semicolon after require(...) is required: the next statement starts with "(",
// so without it the two lines would be parsed as one call expression.
const puppeteer = require("puppeteer");

(async () => {
  // headless: false shows the browser window; set it to true to run without a UI.
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage(); // open new tab in browser
    await page.goto("https://schlegel.media");

    // The callback is executed inside the page, so `document` is the page's DOM.
    const grabSlogan = await page.evaluate(() => {
      const slogan = document.querySelector(".uk-text-lead");
      // Return null instead of throwing inside the page when the selector matches nothing.
      if (slogan === null) {
        return null;
      }
      // Use slogan.innerHTML instead to keep the HTML tags; innerText yields only the text.
      return slogan.innerText;
    });

    console.log(grabSlogan);
  } finally {
    // Always close the browser, even if navigation or evaluation failed.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and signal it via the exit code instead of leaving
  // the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
// Grab multiple elements
// ... same setup as above (launch the browser, open `page`, navigate) ...
const grabList = await page.evaluate(() => {
  // Collect the text of every list item below .uk-nav-default into a plain array.
  const items = document.querySelectorAll(".uk-nav-default li");
  return Array.from(items, (item) => item.innerText);
});
console.log(grabList);
Komplexere DOM-Zugriffe
// Scrapes every quote on the page into an array of { quote, author } objects and prints it.
const puppeteer = require("puppeteer");

(async () => {
  // headless: false shows the browser window; set it to true to run without a UI.
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage(); // open new tab in browser
    await page.goto("https://quotes.toscrape.com/");

    // The callback is executed inside the page, so `document` is the page's DOM.
    const grab = await page.evaluate(() => {
      const quotes = document.querySelectorAll(".quote");
      return Array.from(quotes, (quote) => {
        // Within each .quote element the first <span> is read as the quote text
        // and the <small> inside the second <span> as the author.
        const quoteSpans = quote.querySelectorAll("span");
        const quoteText = quoteSpans[0].innerHTML;
        const quoteAuthor = quoteSpans[1].querySelector("small").innerHTML;
        return { 'quote': quoteText, 'author': quoteAuthor };
      });
    });

    console.log(grab);
  } finally {
    // Always close the browser, even if navigation or evaluation failed.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure and signal it via the exit code instead of leaving
  // the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});