create initial scraping prototype
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
node_modules/
|
||||||
|
.idea/
|
5
go.mod
Normal file
5
go.mod
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
module git.sa.vin/legislature-tracker
|
||||||
|
|
||||||
|
go 1.23
|
||||||
|
|
||||||
|
require golang.org/x/net v0.33.0
|
2
go.sum
Normal file
2
go.sum
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
|
||||||
|
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
52
scraper/index.js
Normal file
52
scraper/index.js
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import puppeteer from 'puppeteer';

// Entry page: a single Utah legislature bill (2024 HB0030) used for prototyping.
const startingLink = 'https://le.utah.gov/~2024/bills/static/HB0030.html';

(async () => {
  // Launch a visible browser while prototyping; switch to headless for real runs.
  const browser = await puppeteer.launch({
    headless: false,
  });

  const text = await getPageText(browser, startingLink);
  console.log(text);
})();

/**
 * Loads `url` in a new page and extracts the bill text from the page's <leg>
 * element, skipping line-number elements. Closes the browser when done
 * (this function owns browser shutdown, even on failure).
 *
 * @param {import('puppeteer').Browser} browser - a launched Puppeteer browser
 * @param {string} url - the bill page to scrape
 * @returns {Promise<string[]>} non-empty trimmed text fragments, in document order
 */
async function getPageText(browser, url) {
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Runs in the browser context; only serializable values come back to Node.
    return await page.evaluate(() => {
      // Recursively collect trimmed text from all descendants, skipping
      // elements marked as line numbers (class "lineno").
      function flattenTree(element) {
        if (!element) return [];

        return Array.from(element.childNodes)
          .flatMap((node) => {
            if (node.nodeType === Node.TEXT_NODE) {
              // Collect text from text nodes.
              return node.textContent.trim();
            }
            if (node.nodeType === Node.ELEMENT_NODE && !node.classList.contains('lineno')) {
              // Recurse into elements that are not line numbers.
              return flattenTree(node);
            }
            // Ignore line-number elements and other node types (comments, etc.).
            return [];
          })
          .filter((text) => text.length > 0); // drop empty strings left by trim()
      }

      // NOTE(review): the bill body appears to live inside a custom <leg>
      // element on le.utah.gov pages — confirm against the live markup.
      const legElement = document.querySelector('leg');
      return legElement ? flattenTree(legElement) : [];
    });
  } finally {
    // Fix: the original only closed the browser on the success path, leaking
    // the browser process whenever goto/evaluate threw.
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
1057
scraper/package-lock.json
generated
Normal file
1057
scraper/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
9
scraper/package.json
Normal file
9
scraper/package.json
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"type": "module",
|
||||||
|
"dependencies": {
|
||||||
|
"puppeteer": "^23.11.0"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"start": "node index.js"
|
||||||
|
}
|
||||||
|
}
|
11
todo.txt
Normal file
11
todo.txt
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
|
||||||
|
[ ] Obtain embeddings for chunks of the bills
|
||||||
|
[ ] Store the chunks and their embeddings in a Turso DB, with associated metadata linking them to the docs and updates
|
||||||
|
[ ] See whether summarizing bills is useful, or loses too much information to be worthwhile
|
||||||
|
[ ] test semantic searches for relevant info
|
||||||
|
[ ] figure out how to format and render diffs for the various versions of the docs a la git diff style
|
||||||
|
[ ] profiles for users should contain enough info to allow me to build a prompt that would find relevant info for the person
|
||||||
|
[ ] allow the user to enter their own prompts
|
||||||
|
[ ] add periodic checking for updates
|
||||||
|
[ ] add notifications to users when they are subscribed to a certain search
|
Reference in New Issue
Block a user