68 lines
2.0 KiB
JavaScript
68 lines
2.0 KiB
JavaScript
import puppeteer from 'puppeteer';
|
|
// const startingLink = 'https://le.utah.gov/~2024/bills/static/HB0030.html';
|
|
// const startingLink = 'https://le.utah.gov/~2025/bills/static/HB0011.html';
|
|
// Page to scrape. An optional first command-line argument overrides the
// default, so a different page can be processed without editing this file.
const startingLink =
  process.argv[2] ?? 'https://le.utah.gov/~2025/bills/static/HB0012.html';

/**
 * Entry point: launches a visible browser, extracts the text fragments of the
 * page at `startingLink`, prints them one sentence per line, and reports
 * simple length statistics.
 */
(async () => {
  // Launch the browser (non-headless, so the window is visible).
  // getPageText is responsible for closing it.
  const browser = await puppeteer.launch({
    headless: false,
  });
  const text = await getPageText(browser, startingLink);

  // Join the fragments into one string and split on sentence boundaries.
  const lines = text.join(' ').split('. ');
  console.log(lines.join('.\n'));

  // `split` always yields at least one element, so the average below never
  // divides by zero. The '. ' separators are not counted in the total.
  const totalChars = lines.reduce((sum, line) => sum + line.length, 0);
  console.log('Total chars:', totalChars);
  console.log('Total lines:', lines.length);
  console.log('Average chars per line:', totalChars / lines.length);
})().catch((error) => {
  // Handle failures explicitly instead of leaving the promise floating:
  // report the error and exit with a failing status.
  console.error(error);
  process.exit(1);
});
|
|
|
|
/**
 * Opens `url` in a new page of `browser` and collects the text found inside
 * the page's `<leg>` element, skipping any element that has the `lineno`
 * class (line numbers).
 *
 * The browser is always closed before this function settles, whether the
 * extraction succeeds or fails.
 *
 * @param {import('puppeteer').Browser} browser - Browser to open the page in; closed by this function.
 * @param {string} url - Address of the page to load.
 * @returns {Promise<string[]>} Trimmed, non-empty text fragments in document
 *   order; an empty array when the page has no `<leg>` element.
 */
async function getPageText(browser, url) {
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // `return await` keeps the evaluation inside the try block, so the
    // browser is closed only after the page has been read.
    return await page.evaluate(() => {
      // ------------------- in the browser context -------------------
      // This callback runs inside the page, so it must be self-contained:
      // the helper is defined here rather than at module level.

      // Recursively gather the trimmed text of every text node below `element`.
      const collectText = (element) =>
        Array.from(element.childNodes).flatMap((node) => {
          if (node.nodeType === Node.TEXT_NODE) {
            const trimmed = node.textContent.trim();
            // Drop whitespace-only text nodes.
            return trimmed.length > 0 ? [trimmed] : [];
          }
          if (
            node.nodeType === Node.ELEMENT_NODE &&
            !node.classList.contains('lineno')
          ) {
            // Descend into elements that are not line numbers.
            return collectText(node);
          }
          // Line-number elements and all other node types are ignored.
          return [];
        });

      const legElement = document.querySelector('leg');
      return legElement ? collectText(legElement) : [];
      // ------------------- in the browser context -------------------
    });
  } finally {
    // Close the browser even if page creation, navigation, or evaluation
    // threw, so a failed run does not leave it running.
    await browser.close();
  }
}
|
|
|
|
|
|
|