create initial scraping prototype
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
node_modules/
|
||||||
|
.idea/
|
5
go.mod
Normal file
5
go.mod
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
module git.sa.vin/legislature-tracker
|
||||||
|
|
||||||
|
go 1.23
|
||||||
|
|
||||||
|
require golang.org/x/net v0.33.0
|
2
go.sum
Normal file
2
go.sum
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
|
||||||
|
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
52
scraper/index.js
Normal file
52
scraper/index.js
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import puppeteer from 'puppeteer';

// Entry page: a single Utah legislature bill (2024 HB0030) used for prototyping.
const startingLink = 'https://le.utah.gov/~2024/bills/static/HB0030.html';

(async () => {
  // Launch a visible browser while prototyping; switch to headless for real runs.
  const browser = await puppeteer.launch({
    headless: false,
  });

  const text = await getPageText(browser, startingLink);
  console.log(text);
})();

/**
 * Loads `url` in a new page and extracts the bill text from the page's <leg>
 * element, skipping line-number elements. Closes the browser when done
 * (this function owns browser shutdown, even on failure).
 *
 * @param {import('puppeteer').Browser} browser - a launched Puppeteer browser
 * @param {string} url - the bill page to scrape
 * @returns {Promise<string[]>} non-empty trimmed text fragments, in document order
 */
async function getPageText(browser, url) {
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Runs in the browser context; only serializable values come back to Node.
    return await page.evaluate(() => {
      // Recursively collect trimmed text from all descendants, skipping
      // elements marked as line numbers (class "lineno").
      function flattenTree(element) {
        if (!element) return [];

        return Array.from(element.childNodes)
          .flatMap((node) => {
            if (node.nodeType === Node.TEXT_NODE) {
              // Collect text from text nodes.
              return node.textContent.trim();
            }
            if (node.nodeType === Node.ELEMENT_NODE && !node.classList.contains('lineno')) {
              // Recurse into elements that are not line numbers.
              return flattenTree(node);
            }
            // Ignore line-number elements and other node types (comments, etc.).
            return [];
          })
          .filter((text) => text.length > 0); // drop empty strings left by trim()
      }

      // NOTE(review): the bill body appears to live inside a custom <leg>
      // element on le.utah.gov pages — confirm against the live markup.
      const legElement = document.querySelector('leg');
      return legElement ? flattenTree(legElement) : [];
    });
  } finally {
    // Fix: the original only closed the browser on the success path, leaking
    // the browser process whenever goto/evaluate threw.
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
|
1057
scraper/package-lock.json
generated
Normal file
1057
scraper/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
9
scraper/package.json
Normal file
9
scraper/package.json
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"type": "module",
|
||||||
|
"dependencies": {
|
||||||
|
"puppeteer": "^23.11.0"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"start": "node index.js"
|
||||||
|
}
|
||||||
|
}
|
11
todo.txt
Normal file
11
todo.txt
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
|
||||||
|
[ ] Obtain embeddings for chunks of the bills
|
||||||
|
[ ] Store the chunks and their embeddings in a Turso DB, with associated metadata linking them to the docs and updates
|
||||||
|
[ ] See whether summarizing bills is useful, or loses too much information to be worthwhile
|
||||||
|
[ ] test semantic searches for relevant info
|
||||||
|
[ ] figure out how to format and render diffs for the various versions of the docs a la git diff style
|
||||||
|
[ ] profiles for users should contain enough info to allow me to build a prompt that would find relevant info for the person
|
||||||
|
[ ] allow the user to enter their own prompts
|
||||||
|
[ ] add periodic checking for updates
|
||||||
|
[ ] add notifications to users when they are subscribed to a certain search
|
Reference in New Issue
Block a user