create initial scraping prototype

This commit is contained in:
2024-12-19 00:02:48 -07:00
commit c4521af5c2
8 changed files with 1620 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
node_modules/
.idea/

5
go.mod Normal file
View File

@ -0,0 +1,5 @@
module git.sa.vin/legislature-tracker
go 1.23
require golang.org/x/net v0.33.0

2
go.sum Normal file
View File

@ -0,0 +1,2 @@
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=

482
main.go Normal file

File diff suppressed because one or more lines are too long

52
scraper/index.js Normal file
View File

@ -0,0 +1,52 @@
import puppeteer from 'puppeteer';
// Bill page the prototype scrapes.
const startingLink = 'https://le.utah.gov/~2024/bills/static/HB0030.html';

// Script entry point: launch a browser, scrape the starting page and print
// the extracted text fragments.
(async () => {
  // headless: false opens a visible browser window.
  const browser = await puppeteer.launch({
    headless: false,
  });
  try {
    const text = await getPageText(browser, startingLink);
    console.log(text);
  } finally {
    // getPageText closes the browser after a successful scrape; this is a
    // safety net so a failed scrape does not leave the browser process
    // running. The `connected` guard avoids closing it a second time.
    if (browser.connected) {
      await browser.close();
    }
  }
})().catch((err) => {
  // Report the failure explicitly instead of leaving a floating rejection,
  // and signal it to the shell through the exit code.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Opens `url` in a new page and collects the text found under the first
 * `<leg>` element, skipping any element that carries the `lineno` class.
 *
 * The browser is closed before this function returns or throws, so the
 * caller must not reuse it afterwards.
 *
 * @param {object} browser - Puppeteer browser; only `newPage()` and `close()` are used.
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<string[]>} Trimmed, non-empty text fragments in document
 *   order, or an empty array when the page has no `<leg>` element.
 */
async function getPageText(browser, url) {
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // The callback runs inside the page, not in Node, so everything it needs
    // (including the helper) has to be defined within it.
    return await page.evaluate(() => {
      // Depth-first walk that returns the text fragments below `element`.
      function flattenTree(element) {
        if (!element) return [];
        return Array.from(element.childNodes)
          .flatMap((node) => {
            if (node.nodeType === Node.TEXT_NODE) {
              // Collect text from text nodes
              return node.textContent.trim();
            }
            if (node.nodeType === Node.ELEMENT_NODE && !node.classList.contains('lineno')) {
              // Recurse into elements that are not line numbers
              return flattenTree(node);
            }
            // Line-number elements and other node types contribute nothing
            return [];
          })
          .filter((fragment) => fragment.length > 0); // Drop whitespace-only text
      }

      const legElement = document.querySelector('leg');
      return legElement ? flattenTree(legElement) : [];
    });
  } finally {
    // Always release the browser, even when navigation or evaluation fails.
    await browser.close();
  }
}

1057
scraper/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

9
scraper/package.json Normal file
View File

@ -0,0 +1,9 @@
{
"type": "module",
"dependencies": {
"puppeteer": "^23.11.0"
},
"scripts": {
"start": "node index.js"
}
}

11
todo.txt Normal file
View File

@ -0,0 +1,11 @@
[ ] Obtain embeddings for chunks of the bills
[ ] store each chunk and its embeddings in a Turso DB, with associated metadata for linking to the docs and updates
[ ] see if summarizing bills is useful or loses too much info to be useful
[ ] test semantic searches for relevant info
[ ] figure out how to format and render diffs between the various versions of the docs, git-diff style
[ ] profiles for users should contain enough info to allow me to build a prompt that would find relevant info for the person
[ ] allow the user to enter their own prompts
[ ] add periodic checking for updates
[ ] add notifications to users when they are subscribed to a certain search