diff --git a/.gitignore b/.gitignore index d35bbf7..fbc85c9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ node_modules/ -.idea/ \ No newline at end of file +.idea/ +/db.sqlite diff --git a/backend/AI/ai.go b/backend/AI/ai.go new file mode 100644 index 0000000..411562f --- /dev/null +++ b/backend/AI/ai.go @@ -0,0 +1,99 @@ +package AI + +import ( + "context" + "fmt" + "github.com/pkoukk/tiktoken-go" + "github.com/sashabaranov/go-openai" + "os" +) + +// This package should use the OpenAI API to provide AI services. + +type AI interface { + // Get Embedding + GetEmbeddings(ctx context.Context, text string) (openai.EmbeddingResponse, error) + GetTokenCount(input string) (int, error) +} + +type ai struct { + apiKey string + baseURL string + encodingName string + model string + client *openai.Client +} + +type AIOption func(*ai) + +func NewAI(otps ...AIOption) (AI, error) { + a := ai{ + //baseURL: "https://api.openai.com", + encodingName: "gpt-4o", + model: openai.GPT4oMini, + } + + for _, opt := range otps { + opt(&a) + } + + if a.apiKey == "" && os.Getenv("OPENAI_API_KEY") != "" { + a.apiKey = os.Getenv("OPENAI_API_KEY") + } + if a.apiKey == "" { + return nil, fmt.Errorf("api key is required") + } + + config := openai.DefaultConfig(a.apiKey) + if a.baseURL == "" && os.Getenv("OPENAI_BASE_URL") != "" { + a.baseURL = os.Getenv("OPENAI_BASE_URL") + } + + if a.baseURL != "" { + config.BaseURL = a.baseURL + } + + a.client = openai.NewClientWithConfig(config) + + return a, nil +} + +func (a ai) GetEmbeddings(ctx context.Context, text string) (openai.EmbeddingResponse, error) { + embeddingRequest := openai.EmbeddingRequest{ + Input: text, + Model: "text-embedding-3-small", + } + + embeddings, err := a.client.CreateEmbeddings(ctx, embeddingRequest) + if err != nil { + return openai.EmbeddingResponse{}, fmt.Errorf("error creating embeddings: %w", err) + } + return embeddings, nil +} + +func WithAPIKey(apiKey string) AIOption { + return func(a *ai) { + a.apiKey = apiKey + } +} + +func WithBaseURL(baseURL string) AIOption { + return func(a *ai) { + a.baseURL = baseURL + } +} + +func WithEncodingName(encodingName string) AIOption { + return func(a *ai) { + a.encodingName = encodingName + } +} + +func (a ai) GetTokenCount(input string) (int, error) { + tke, err := tiktoken.EncodingForModel(a.encodingName) // cached in "TIKTOKEN_CACHE_DIR" + if err != nil { + return 0, fmt.Errorf("error getting encoding: %w", err) + } + token := tke.Encode(input, nil, nil) + return len(token), nil +} diff --git a/backend/Leg/utah.go b/backend/Leg/utah.go new file mode 100644 index 0000000..b9ab0e8 --- /dev/null +++ b/backend/Leg/utah.go @@ -0,0 +1,73 @@ +package Leg + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "git.sa.vin/legislature-tracker/backend/cachedAPI" + "git.sa.vin/legislature-tracker/backend/types" +) + +type UtahLeg interface { + GetBillList(year, session string) (types.UtahBillList, error) + GetBillDetails(year, session, billID string) (types.UtahBill, error) +} + +type utahLeg struct { + cache cachedAPI.CachedAPI +} + +var developerToken string + +func NewUtahLeg(cache cachedAPI.CachedAPI) UtahLeg { + developerToken = os.Getenv("UTAH_DEV_TOKEN") + return &utahLeg{ + cache: cache, + } +} + +// GetBillList gets the list of bills for a given year and session, +// session should be one of "GS", "S#" where # is the session number +func (u utahLeg) GetBillList(year, session string) (types.UtahBillList, error) { + // if session is not GS it must start with S and end with a number + if session != "GS" && (session[0] != 'S' || session[1] < '0' || session[1] > '9') { + return types.UtahBillList{}, fmt.Errorf("session must be one of GS or S with some number") + } + respString, err := u.cache.Get(fmt.Sprintf("https://glen.le.utah.gov/bills/%v%v/billlist/%v", year, session, developerToken), time.Hour) + if err != nil { + return types.UtahBillList{}, fmt.Errorf("error getting bill list: %w", err) + } + if respString == "Invalid request" { + return types.UtahBillList{}, fmt.Errorf("invalid request") + } + var billList types.UtahBillList + err = json.Unmarshal([]byte(respString), &billList) + if err != nil { + return types.UtahBillList{}, fmt.Errorf("error unmarshalling bill list: %w", err) + } + return billList, nil +} + +// GetBillDetails gets the details of a bill for a given year, session, and billID +// session should be one of "GS", "S2" +func (u utahLeg) GetBillDetails(year, session, billID string) (types.UtahBill, error) { + // if session is not GS it must start with S and end with a number + if session != "GS" && (session[0] != 'S' || session[1] < '0' || session[1] > '9') { + return types.UtahBill{}, fmt.Errorf("session must be one of GS or S with some number") + } + respString, err := u.cache.Get(fmt.Sprintf("https://glen.le.utah.gov/bills/%v%v/%v/%v", year, session, billID, developerToken), time.Hour) + if err != nil { + return types.UtahBill{}, fmt.Errorf("error getting bill details: %w", err) + } + if respString == "Invalid request" { + return types.UtahBill{}, fmt.Errorf("invalid request") + } + var bill types.UtahBill + err = json.Unmarshal([]byte(respString), &bill) + if err != nil { + return types.UtahBill{}, fmt.Errorf("error unmarshalling bill details: %w", err) + } + return bill, nil +} diff --git a/backend/cachedAPI/cachedAPI.go b/backend/cachedAPI/cachedAPI.go new file mode 100644 index 0000000..e603cda --- /dev/null +++ b/backend/cachedAPI/cachedAPI.go @@ -0,0 +1,52 @@ +package cachedAPI + +import ( + "fmt" + "git.sa.vin/legislature-tracker/backend/datastore" + "io" + "net/http" + "time" +) + +// This package behaves like an API but uses libSQL as a cache that gets checked before the actual API is called. +type CachedAPI interface { + Get(url string, cacheTTL time.Duration) (string, error) +} + +type cachedAPI struct { + mapper datastore.CacheStore +} + +func NewCachedAPI(mapper datastore.CacheStore) CachedAPI { + return &cachedAPI{ + mapper: mapper, + } +} + +func (c cachedAPI) Get(url string, cacheTTL time.Duration) (string, error) { + response, found, err := c.mapper.CachedAPI(url) + if err != nil { + return "", fmt.Errorf("error getting cached API response: %w", err) + } + if found { + return response, nil + } + // Call the actual API + resp, err := http.Get(url) + if err != nil { + return "", fmt.Errorf("error calling API: %w", err) + } + defer resp.Body.Close() + // Read the response + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("error reading API response: %w", err) + } + // Save the response to the cache + err = c.mapper.SaveAPIResponse(url, string(bodyBytes), cacheTTL) + if err != nil { + return "", fmt.Errorf("error saving API response: %w", err) + } + + return string(bodyBytes), nil +} diff --git a/backend/datastore/mapper.go b/backend/datastore/mapper.go new file mode 100644 index 0000000..c12037e --- /dev/null +++ b/backend/datastore/mapper.go @@ -0,0 +1,125 @@ +package datastore + +import ( + "database/sql" + "fmt" + "git.sa.vin/legislature-tracker/backend/types" + "strings" + "time" +) + +type CacheStore interface { + CachedAPI(url string) (string, bool, error) + SaveAPIResponse(url, response string, cacheTTL time.Duration) error +} + +type SearchStore interface { + SaveEmbeddings(id, content string, embeddings []float32) error + FindRelevantContent(queryEmbeddings []float32) ([]types.SearchResponse, error) +} + +type Mapper struct { + db *sql.DB +} + +func NewMapper(db *sql.DB) *Mapper { + return &Mapper{ + db: db, + } +} + +// CachedAPI returns the cached API response for the given URL +// If the URL is not in the cache it returns an empty string and false +func (m *Mapper) CachedAPI(url string) (string, bool, error) { + // Check the cache for the URL + // If the URL is in the cache, return the cached response + // Otherwise, call the API and cache the response + + query := `SELECT response, created_at, ttl FROM cache WHERE url = ?` + rows, err := m.db.Query(query, url) + if err != nil { + // norows error is not an error + if err == sql.ErrNoRows { + return "", false, nil + } + return "", false, fmt.Errorf("error reading from cache url: %v | %w", url, err) + } + defer rows.Close() + + var response struct { + Response string + CreatedAt time.Time + TTL time.Duration + } + for rows.Next() { + err = rows.Scan(&response.Response, &response.CreatedAt, &response.TTL) + if err != nil { + return "", false, fmt.Errorf("error scanning cache response: %w", err) + } + // Check if the cache is expired + if time.Since(response.CreatedAt) > response.TTL { + return "", false, nil + } + return response.Response, true, nil + } + return "", false, nil +} + +// SaveAPIResponse saves the API response to the cache +func (m *Mapper) SaveAPIResponse(url, response string, cacheTTL time.Duration) error { + // Insert the response into the cache + query := `INSERT INTO cache (url, response, ttl) VALUES (?, ?, ?)` + _, err := m.db.Exec(query, url, response, cacheTTL) + if err != nil { + if strings.Contains(err.Error(), "UNIQUE constraint failed: cache.url") { + // Update the existing row if there is a UNIQUE constraint error + updateQuery := `UPDATE cache SET response = ?, ttl = ? WHERE url = ?` + _, updateErr := m.db.Exec(updateQuery, response, cacheTTL, url) + if updateErr != nil { + return fmt.Errorf("error updating cache response: %w", updateErr) + } + return nil + } + return fmt.Errorf("error inserting cache response: %w", err) + } + return nil +} + +func (m *Mapper) SaveEmbeddings(id, content string, embeddings []float32) error { + // Insert the embeddings into the database + query := `INSERT INTO searchable_content (trackingid, content, full_emb) VALUES (?, ?, vector32(?))` + _, err := m.db.Exec(query, id, content, serializeEmbeddings(embeddings)) + if err != nil { + return fmt.Errorf("error inserting embeddings: %w", err) + } + return nil +} + +func serializeEmbeddings(embeddings []float32) string { + return strings.Join(strings.Split(fmt.Sprintf("%v", embeddings), " "), ", ") +} + +func (m *Mapper) FindRelevantContent(queryEmbeddings []float32) ([]types.SearchResponse, error) { + // Find the relevant content in the database + query := `SELECT searchable_content.trackingid, searchable_content.content FROM vector_top_k('emb_idx', vector32(?), 10) JOIN searchable_content ON id = searchable_content.rowid` + rows, err := m.db.Query(query, serializeEmbeddings(queryEmbeddings)) + if err != nil { + // norows error is not an error + if err == sql.ErrNoRows { + return nil, nil + } + return nil, fmt.Errorf("error querying embeddings: %w", err) + } + defer rows.Close() + + var results []types.SearchResponse + for rows.Next() { + var result types.SearchResponse + err = rows.Scan(&result.TrackingID, &result.Content) + if err != nil { + return nil, fmt.Errorf("error scanning embeddings: %w", err) + } + results = append(results, result) + } + return results, nil +} diff --git a/backend/datastore/mapper_test.go b/backend/datastore/mapper_test.go new file mode 100644 index 0000000..00c542f --- /dev/null +++ b/backend/datastore/mapper_test.go @@ -0,0 +1,36 @@ +package datastore + +import "testing" + +func Benchmark_mySerializedEmbeddings(b *testing.B) { + type args struct { + embeddings []float32 + } + tests := []struct { + name string + args args + want string + }{ + { + name: "Test 1", + args: args{ + embeddings: []float32{0.1, 0.2, 0.3}, + }, + want: "[0.1, 0.2, 0.3]", + }, + { + name: "Crazy long test", + args: args{ + embeddings: []float32{0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, + }, + want: "[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]", + }, + } + for _, tt := range tests { + b.Run(tt.name, func(t *testing.B) { + if got := serializeEmbeddings(tt.args.embeddings); got != tt.want { + t.Errorf("mySerializedEmbeddings() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/backend/main.go b/backend/main.go new file mode 100644 index 0000000..d8513c2 --- /dev/null +++ b/backend/main.go @@ -0,0 +1,79 @@ +package main + +import ( + "embed" + "git.sa.vin/legislature-tracker/backend/AI" + "git.sa.vin/legislature-tracker/backend/search" + "log" + "os" + + "git.sa.vin/legislature-tracker/backend/Leg" + "git.sa.vin/legislature-tracker/backend/cachedAPI" + "git.sa.vin/legislature-tracker/backend/datastore" + "github.com/payne8/go-libsql-dual-driver" +) + +//go:embed migrations/*.sql +var migrationFiles embed.FS + +func main() { + + logger := log.New(os.Stdout, "any-remark", log.LstdFlags) + primaryUrl := os.Getenv("LIBSQL_DATABASE_URL") + authToken := os.Getenv("LIBSQL_AUTH_TOKEN") + + tdb, err := libsqldb.NewLibSqlDB( + primaryUrl, + libsqldb.WithMigrationFiles(migrationFiles), + libsqldb.WithAuthToken(authToken), + libsqldb.WithLocalDBName("local.db"), // will not be used for remote-only + ) + if err != nil { + logger.Printf("failed to open db %s: %s", primaryUrl, err) + log.Fatalln(err) + return + } + err = tdb.Migrate() + if err != nil { + logger.Printf("failed to migrate db %s: %s", primaryUrl, err) + log.Fatalln(err) + return + } + + mapper := datastore.NewMapper(tdb.DB) + api := cachedAPI.NewCachedAPI(mapper) + utah := Leg.NewUtahLeg(api) + ai, err := AI.NewAI() + if err != nil { + log.Fatalf("error creating AI: %v", err) + } + searchService, err := search.NewSearch(search.WithAI(ai), search.WithMapper(mapper)) + if err != nil { + log.Fatalf("error creating search: %v", err) + } + + test, err := utah.GetBillList("2024", "GS") + if err != nil { + log.Fatalf("error getting bill list: %v", err) + } + log.Printf("bill list: %+v", test) + + test2, err := utah.GetBillDetails("2024", "GS", "HB0001") + if err != nil { + log.Fatalf("error getting bill details: %v", err) + } + log.Printf("bill details: %+v", test2) + + //err = searchService.InsertContent(context.Background(), test2.TrackingID, test2.GeneralProvisions+" "+test2.HilightedProvisions) + //if err != nil { + // log.Fatalf("error inserting content: %v", err) + //} + + results, err := searchService.Search("I'm looking for a bill that affects public education") + if err != nil { + log.Fatalf("error searching: %v", err) + } + + log.Printf("search results: %+v", results) + +} diff --git a/backend/migrations/2025-01-03-0001-add-searchable-content.sql b/backend/migrations/2025-01-03-0001-add-searchable-content.sql new file mode 100644 index 0000000..9030944 --- /dev/null +++ b/backend/migrations/2025-01-03-0001-add-searchable-content.sql @@ -0,0 +1,8 @@ +CREATE TABLE searchable_content ( + trackingid TEXT NOT NULL, + content TEXT NOT NULL, + full_emb F32_BLOB(1536) NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX emb_idx ON searchable_content (libsql_vector_idx(full_emb)); \ No newline at end of file diff --git a/backend/migrations/2025-01-03-init.sql b/backend/migrations/2025-01-03-init.sql new file mode 100644 index 0000000..2abbd36 --- /dev/null +++ b/backend/migrations/2025-01-03-init.sql @@ -0,0 +1,11 @@ +CREATE TABLE IF NOT EXISTS cache ( + id INTEGER PRIMARY KEY, + url TEXT NOT NULL UNIQUE, + response TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + ttl INTEGER DEFAULT 0 +); + +CREATE INDEX idx_url ON cache (url); +CREATE INDEX idx_created_at ON cache (created_at); + diff --git a/backend/search/search.go b/backend/search/search.go new file mode 100644 index 0000000..7bdb2d3 --- /dev/null +++ b/backend/search/search.go @@ -0,0 +1,77 @@ +package search + +import ( + "context" + "fmt" + "git.sa.vin/legislature-tracker/backend/AI" + "git.sa.vin/legislature-tracker/backend/datastore" + "git.sa.vin/legislature-tracker/backend/types" +) + +type Search interface { + Search(query string) ([]types.SearchResponse, error) + InsertContent(ctx context.Context, id string, content string) error +} + +type SearchOption func(s *search) + +func NewSearch(opts ...SearchOption) (Search, error) { + s := &search{} + for _, opt := range opts { + opt(s) + } + if s.ai == nil { + return nil, fmt.Errorf("AI is required") + } + if s.mapper == nil { + return nil, fmt.Errorf("mapper is required") + } + return s, nil +} + +func WithMapper(mapper datastore.SearchStore) func(s *search) { + return func(s *search) { + s.mapper = mapper + } +} + +func WithAI(ai AI.AI) func(s *search) { + return func(s *search) { + s.ai = ai + } +} + +type search struct { + ai AI.AI + mapper datastore.SearchStore +} + +func (s search) Search(query string) ([]types.SearchResponse, error) { + // get embeddings for the query + embeddings, err := s.ai.GetEmbeddings(context.Background(), query) + if err != nil { + return nil, fmt.Errorf("error getting embeddings: %w", err) + } + if len(embeddings.Data) == 0 { + return nil, fmt.Errorf("no embeddings returned") + } + // find relevant content in the database + return s.mapper.FindRelevantContent(embeddings.Data[0].Embedding) +} + +func (s search) InsertContent(ctx context.Context, id string, content string) error { + // get embeddings for the content + embeddings, err := s.ai.GetEmbeddings(ctx, content) + if err != nil { + return fmt.Errorf("error getting embeddings: %w", err) + } + if len(embeddings.Data) == 0 { + return fmt.Errorf("no embeddings returned") + } + // save the embeddings to the database + err = s.mapper.SaveEmbeddings(id, content, embeddings.Data[0].Embedding) + if err != nil { + return fmt.Errorf("error saving embeddings: %w", err) + } + return nil +} diff --git a/backend/types/search.go b/backend/types/search.go new file mode 100644 index 0000000..fe3a0fd --- /dev/null +++ b/backend/types/search.go @@ -0,0 +1,6 @@ +package types + +type SearchResponse struct { + TrackingID string + Content string +} diff --git a/backend/types/utah.go b/backend/types/utah.go new file mode 100644 index 0000000..0b6bfba --- /dev/null +++ b/backend/types/utah.go @@ -0,0 +1,33 @@ +package types + +// UtahBill is a struct that represents a bill in the Utah legislature +type UtahBill struct { + Bill string `json:"bill"` + Version string `json:"version"` + ShortTitle string `json:"shorttitle"` + Sponsor string `json:"sponsor"` + FloorSponsor string `json:"floorsponsor"` + GeneralProvisions string `json:"generalprovisions"` + HilightedProvisions string `json:"hilightedprovisions"` + Monies string `json:"monies"` + Attorney string `json:"attorney"` + FiscalAnalyst string `json:"fiscalanalyst"` + LastAction string `json:"lastaction"` + LastActionOwner string `json:"lastactionowner"` + LastActionTime string `json:"lastactiontime"` + TrackingID string `json:"trackingid"` + Subjects []string `json:"subjects"` + CodeSections []string `json:"codesections"` + Agendas []string `json:"agendas"` +} + +// UtahBillListItem is a struct that represents a bill in a list of bills +type UtahBillListItem struct { + Number string `json:"number"` + UpdateTime string `json:"updatetime"` +} + +// UtahBillList is a struct that represents a list of bills in the Utah legislature +type UtahBillList struct { + Bills []UtahBillListItem `json:"bills"` +} diff --git a/go.mod b/go.mod index f430008..b8f6828 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,21 @@ module git.sa.vin/legislature-tracker go 1.23 -require golang.org/x/net v0.33.0 +require ( + github.com/payne8/go-libsql-dual-driver v0.2.3 + github.com/pkoukk/tiktoken-go v0.1.7 + github.com/sashabaranov/go-openai v1.36.1 +) + +require ( + github.com/antlr4-go/antlr/v4 v4.13.0 // indirect + github.com/dlclark/regexp2 v1.10.0 // indirect + github.com/google/uuid v1.3.0 // indirect + github.com/hashicorp/errwrap v1.0.0 // indirect + github.com/hashicorp/go-multierror v1.1.1 // indirect + github.com/libsql/sqlite-antlr4-parser v0.0.0-20240327125255-dbf53b6cbf06 // indirect + github.com/tursodatabase/go-libsql v0.0.0-20240429120401-651096bbee0b // indirect + github.com/tursodatabase/libsql-client-go v0.0.0-20240628122535-1c47b26184e8 // indirect + golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 // indirect + nhooyr.io/websocket v1.8.10 // indirect +) diff --git a/go.sum b/go.sum index 16660ab..6a1a583 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,42 @@ -golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= -golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= +github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= +github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= +github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= +github.com/libsql/sqlite-antlr4-parser v0.0.0-20240327125255-dbf53b6cbf06 h1:JLvn7D+wXjH9g4Jsjo+VqmzTUpl/LX7vfr6VOfSWTdM= +github.com/libsql/sqlite-antlr4-parser v0.0.0-20240327125255-dbf53b6cbf06/go.mod h1:FUkZ5OHjlGPjnM2UyGJz9TypXQFgYqw6AFNO1UiROTM= +github.com/payne8/go-libsql-dual-driver v0.2.3 h1:ea19rrdn3QQqvDrHNZ5gqqj2Nn7DbhGDVvDL4UDYZ68= +github.com/payne8/go-libsql-dual-driver v0.2.3/go.mod h1:fhe8WdGtBLvGZ5drN9We0uWEedXeCCTvWaTLExrGW9M= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkoukk/tiktoken-go v0.1.7 h1:qOBHXX4PHtvIvmOtyg1EeKlwFRiMKAcoMp4Q+bLQDmw= +github.com/pkoukk/tiktoken-go v0.1.7/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sashabaranov/go-openai v1.36.1 h1:EVfRXwIlW2rUzpx6vR+aeIKCK/xylSrVYAx1TMTSX3g= +github.com/sashabaranov/go-openai v1.36.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tursodatabase/go-libsql v0.0.0-20240429120401-651096bbee0b h1:R7hev4b96zgXjKbS2ZNbHBnDvyFZhH+LlMqtKH6hIkU= +github.com/tursodatabase/go-libsql v0.0.0-20240429120401-651096bbee0b/go.mod h1:TjsB2miB8RW2Sse8sdxzVTdeGlx74GloD5zJYUC38d8= +github.com/tursodatabase/libsql-client-go v0.0.0-20240628122535-1c47b26184e8 h1:XM3aeBrpNrkvi48EiKCtMNAgsiaAaAOCHAW9SaIWouo= +github.com/tursodatabase/libsql-client-go v0.0.0-20240628122535-1c47b26184e8/go.mod h1:fblU7nZYWAROzJzkpln8teKFDtdRvAOmZHeIpahY4jk= +golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= +golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= +gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= +nhooyr.io/websocket v1.8.10 h1:mv4p+MnGrLDcPlBoWsvPP7XCzTYMXP9F9eIGoKbgx7Q= +nhooyr.io/websocket v1.8.10/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c= diff --git a/main.go b/main.go deleted file mode 100644 index 3968b12..0000000 --- a/main.go +++ /dev/null @@ -1,482 +0,0 @@ -package main - -import ( - "fmt" - "log" - "strings" - - "golang.org/x/net/html" -) - -func main() { - htmlStr := ` - - - - - - - - - - - - - - - - - - - -HB0030 - - - - - - - - -
-
- - -
- -
-
- -

H.B. 30 Road Rage Amendments

- -
-
-
Bill Sponsor:

Rep. Cutler, Paul A.
Floor Sponsor:

Sen. Weiler, Todd D.
  • CoSponsor(s):
        Acton, C.K.  Pierucci, C.
        Stoddard, A.
  • Drafting Attorney: Jacqueline Carlton
  • Fiscal Analyst: Gary R. Syphus

- -
-
-
-
  • Information
    • Last Action: 18 Mar 2024, Governor Signed
    • Last Location: Lieutenant Governor's office for filing

-
- -
-
-
-
-
-Bill Status / Votes
-
• Senate Actions • House Actions • Fiscal Actions • Other Actions
DateActionLocationVote
12/19/2023 Bill Numbered but not DistributedLegislative Research and General Counsel
12/19/2023 Numbered Bill Publicly DistributedLegislative Research and General Counsel
1/10/2024 House/ received bill from Legislative ResearchClerk of the House
1/16/2024 House/ 1st reading (Introduced)House Rules Committee
1/25/2024 House/ received fiscal note from Fiscal AnalystHouse Rules Committee
1/31/2024 House/ to standing committeeHouse Law Enforcement and Criminal Justice Committee
2/9/2024 House Comm - Substitute Recommendation from # 0 to # 3House Law Enforcement and Criminal Justice Committee7 0 5
2/9/2024 House Comm - Favorable RecommendationHouse Law Enforcement and Criminal Justice Committee7 0 5
2/12/2024 (10:18:50 AM)House/ comm rpt/ substitutedHouse Law Enforcement and Criminal Justice Committee
2/12/2024 (10:18:51 AM)House/ 2nd readingHouse 3rd Reading Calendar for House bills
2/12/2024 LFA/ fiscal note sent to sponsorHouse 3rd Reading Calendar for House bills
2/12/2024 LFA/ fiscal note publicly availableHouse 3rd Reading Calendar for House bills
2/16/2024 (11:43:56 AM)House/ 3rd readingHouse 3rd Reading Calendar for House bills
2/16/2024 (11:49:30 AM)House/ floor amendment # 1House 3rd Reading Calendar for House billsVoice vote
2/16/2024 (11:58:00 AM)House/ passed 3rd readingSenate Secretary51 17 7
2/16/2024 (11:58:02 AM)House/ to SenateSenate Secretary
2/16/2024 Senate/ received from HouseWaiting for Introduction in the Senate
2/16/2024 Senate/ 1st reading (Introduced)Senate Rules Committee
2/20/2024 Senate/ to standing committeeSenate Judiciary, Law Enforcement, and Criminal Justice Committee
2/22/2024 Senate Comm - Favorable RecommendationSenate Judiciary, Law Enforcement, and Criminal Justice Committee3 0 3
2/22/2024 (2:19:24 PM)Senate/ committee report favorableSenate Judiciary, Law Enforcement, and Criminal Justice Committee
2/22/2024 (2:19:25 PM)Senate/ placed on 2nd Reading CalendarSenate 2nd Reading Calendar
2/27/2024 Senate/ 2nd Reading Calendar to RulesSenate Rules Committee
2/28/2024 Senate/ Rules to 2nd Reading CalendarSenate 2nd Reading Calendar
2/28/2024 (12:09:34 PM)Senate/ 2nd & 3rd readings/ suspensionSenate 2nd Reading Calendar
2/28/2024 (12:09:48 PM)Senate/ circledSenate 2nd Reading CalendarVoice vote
2/28/2024 (8:05:19 PM)Senate/ uncircledSenate 2nd Reading CalendarVoice vote
2/28/2024 (8:11:13 PM)Senate/ passed 2nd & 3rd readings/ suspensionSenate President23 5 1
2/28/2024 (8:11:14 PM)Senate/ signed by President/ returned to HouseHouse Speaker
2/28/2024 (8:11:15 PM)Senate/ to HouseHouse Speaker
2/29/2024 House/ received from SenateHouse Speaker
2/29/2024 House/ signed by Speaker/ sent for enrollingLegislative Research and General Counsel / Enrolling
2/29/2024 Bill Received from House for EnrollingLegislative Research and General Counsel / Enrolling
2/29/2024 Draft of Enrolled Bill PreparedLegislative Research and General Counsel / Enrolling
3/7/2024 Enrolled Bill Returned to House or SenateClerk of the House
3/7/2024 House/ enrolled bill to PrintingClerk of the House
3/8/2024 House/ received enrolled bill from PrintingClerk of the House
3/8/2024 House/ to GovernorExecutive Branch - Governor
3/18/2024 Governor SignedLieutenant Governor's office for filing
-
- - -
- - - - - - - - - - - - -` - - doc, err := html.Parse(strings.NewReader(htmlStr)) - if err != nil { - log.Fatal(err) - } - - var extractText func(*html.Node) - extractText = func(n *html.Node) { - if n.Type == html.TextNode { - fmt.Printf("%v\n", n.Data) - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - extractText(c) - } - } - - extractText(doc) -} diff --git a/scraper/index.js b/scraper/index.js index 3ee606d..6ca89b9 100644 --- a/scraper/index.js +++ b/scraper/index.js @@ -1,14 +1,26 @@ import puppeteer from 'puppeteer'; -const startingLing = 'https://le.utah.gov/~2024/bills/static/HB0030.html'; +// const startingLink = 'https://le.utah.gov/~2024/bills/static/HB0030.html'; +// const startingLink = 'https://le.utah.gov/~2025/bills/static/HB0011.html'; +const startingLink = 'https://le.utah.gov/~2025/bills/static/HB0012.html'; (async () => { // Launch the browser and open a new blank page const browser = await puppeteer.launch({ headless: false, }); - let text = await getPageText(browser, startingLing); + let text = await getPageText(browser, startingLink); - console.log(text); + const lines = text.join(' ').split('. '); + + console.log(lines.join('.\n')); + + let totalChars = 0; + for (let line of lines) { + totalChars += line.length; + } + console.log('Total chars:', totalChars); + console.log('Total lines:', lines.length); + console.log('Average chars per line:', totalChars / lines.length); })(); @@ -17,6 +29,7 @@ async function getPageText(browser, url) { await page.goto(url); const test = await page.evaluate(() => { + // ------------------- in the browser context ------------------- // Use the querySelector to target the leg element const legElement = document.querySelector('leg'); if (legElement) { @@ -42,6 +55,8 @@ async function getPageText(browser, url) { }) .filter((text) => text.length > 0); // Filter out any leftover empty strings } + + // ------------------- in the browser context ------------------- }); await browser.close();