create initial scraping prototype
This commit is contained in:
52
scraper/index.js
Normal file
52
scraper/index.js
Normal file
@ -0,0 +1,52 @@
|
||||
import puppeteer from 'puppeteer';
|
||||
// Page the prototype scrapes.
const startingLink = 'https://le.utah.gov/~2024/bills/static/HB0030.html';

// Script entry point: launch a browser, scrape the starting page, and print
// the extracted text fragments.
(async () => {
  // Launch the browser with a visible window (headless: false).
  const browser = await puppeteer.launch({
    headless: false,
  });

  try {
    const text = await getPageText(browser, startingLink);

    console.log(text);
  } finally {
    // getPageText closes the browser after a successful scrape; only close
    // here if it is still connected (e.g. the scrape threw part-way), so the
    // browser window is never left open and close() is not called twice.
    if (browser.connected) {
      await browser.close();
    }
  }
})().catch((err) => {
  // Report the failure and exit non-zero instead of leaving an unhandled
  // promise rejection.
  console.error(err);
  process.exitCode = 1;
});
|
||||
|
||||
/**
 * Opens `url` in a new page of `browser` and collects the text found under
 * the page's first `leg` element as a flat list of trimmed, non-empty strings.
 * Elements carrying the `lineno` class (and everything inside them) are
 * skipped.
 *
 * The browser is closed before this function settles, whether the scrape
 * succeeded or threw, so the caller must not reuse `browser` afterwards.
 *
 * @param {object} browser - Browser handle providing `newPage()` and `close()`.
 * @param {string} url - Address of the page to load.
 * @returns {Promise<string[]>} Text fragments in document order; an empty
 *   array when the page has no `leg` element.
 */
async function getPageText(browser, url) {
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // The callback is handed to page.evaluate, so it must be self-contained:
    // it cannot reference anything declared outside its own body.
    return await page.evaluate(() => {
      /**
       * Recursively gathers the trimmed text of every text node under
       * `element`, skipping subtrees rooted at `lineno` elements.
       */
      function flattenTree(element) {
        if (!element) return [];

        return Array.from(element.childNodes)
          .flatMap((node) => {
            if (node.nodeType === Node.TEXT_NODE) {
              return node.textContent.trim();
            }
            if (node.nodeType === Node.ELEMENT_NODE && !node.classList.contains('lineno')) {
              return flattenTree(node);
            }
            // Line-number elements and any other node types contribute nothing.
            return [];
          })
          // Whitespace-only text nodes trim down to '' and are dropped here.
          .filter((text) => text.length > 0);
      }

      const legElement = document.querySelector('leg');
      return legElement ? flattenTree(legElement) : [];
    });
  } finally {
    // Close on every path so a failed navigation or evaluation does not
    // leave the browser running.
    await browser.close();
  }
}
|
||||
|
||||
|
||||
|
1057
scraper/package-lock.json
generated
Normal file
1057
scraper/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
9
scraper/package.json
Normal file
9
scraper/package.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"puppeteer": "^23.11.0"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "node index.js"
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user