From 4ca6dfb51d7d5c52a086179172412f8bc422a971 Mon Sep 17 00:00:00 2001 From: Mason Payne Date: Mon, 3 Feb 2025 23:49:19 -0700 Subject: [PATCH] add support for a few more formats --- .gitignore | 1 + main.go | 134 +++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 115 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 3fd7c43..8216995 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /response.txt /llmparse.exe +/response2.txt diff --git a/main.go b/main.go index 3634fd5..e012e24 100644 --- a/main.go +++ b/main.go @@ -1,6 +1,7 @@ package main import ( + "bufio" "flag" "fmt" "io" @@ -9,18 +10,23 @@ import ( "os" "path/filepath" "regexp" + "strings" ) +// fileBlock holds a file name and its content. +type fileBlock struct { + name string + content string +} + func main() { // Define the output directory flag. outDir := flag.String("out", ".", "Directory to write output files") flag.Parse() - // Determine source of input - either a file or STDIN. + // Read input from a file or STDIN. var data []byte var err error - - // If no positional arguments or "-" is provided, read from STDIN. if flag.NArg() < 1 || flag.Arg(0) == "-" { data, err = io.ReadAll(os.Stdin) if err != nil { @@ -33,13 +39,26 @@ func main() { log.Fatalf("Error reading file %s: %v", inputFile, err) } } - text := string(data) - // Updated regular expression to handle both LF and CRLF line breaks. - re := regexp.MustCompile(`(?s)File:\s*(.+?)\s*\r?\n-+\r?\n(.*?)(\r?\n-+\r?\n|$)`) - matches := re.FindAllStringSubmatch(text, -1) - if matches == nil { + // First, try using the "File:" format. + reFile := regexp.MustCompile(`(?s)File:\s*(.+?)\s*\r?\n-+\r?\n(.*?)(\r?\n-+\r?\n|$)`) + matches := reFile.FindAllStringSubmatch(text, -1) + + var blocks []fileBlock + if matches != nil && len(matches) > 0 { + for _, m := range matches { + blocks = append(blocks, fileBlock{ + name: strings.TrimSpace(m[1]), + content: m[2], + }) + } + } else { + // Fall back to parsing the numbered format manually. + blocks = parseNumberedBlocks(text) + } + + if len(blocks) == 0 { log.Println("No file entries found in the input.") return } @@ -49,26 +68,101 @@ func main() { log.Fatalf("Error creating output directory %s: %v", *outDir, err) } - for _, match := range matches { - // match[1] is the file name, and match[2] is the file content. - fileName := match[1] - fileContent := match[2] - - // Create the full file path using the output directory. - fullPath := filepath.Join(*outDir, fileName) - - // Ensure that the directory for the file exists (in case there's nested directories). + // Write the blocks out as files. + for _, block := range blocks { + fullPath := filepath.Join(*outDir, block.name) dir := filepath.Dir(fullPath) if err := os.MkdirAll(dir, os.ModePerm); err != nil { log.Printf("Error creating directory %s: %v", dir, err) continue } - - if err := ioutil.WriteFile(fullPath, []byte(fileContent), 0644); err != nil { - log.Printf("Error writing to file %s: %v", fullPath, err) + if err := ioutil.WriteFile(fullPath, []byte(block.content), 0644); err != nil { + log.Printf("Error writing file %s: %v", fullPath, err) } else { fmt.Printf("Created file: %s\n", fullPath) } } } + +// parseNumberedBlocks scans the text looking for divider lines +// and header lines of the form "1. filename". +// It returns a slice of fileBlock values. +func parseNumberedBlocks(text string) []fileBlock { + var blocks []fileBlock + + // We'll use a scanner to process the text line by line. + scanner := bufio.NewScanner(strings.NewReader(text)) + // Regular expression to recognize a divider line (at least 10 dashes or box drawing characters) + dividerRe := regexp.MustCompile(`^[\s]*[─-]{10,}[\s]*$`) + // Regular expression for matching a header line like "1. index.html" + headerRe := regexp.MustCompile(`^\s*\d+\.\s*(.+?)\s*$`) + + lines := []string{} + for scanner.Scan() { + lines = append(lines, scanner.Text()) + } + + i := 0 + for i < len(lines) { + // Look for a divider line. + if dividerRe.MatchString(lines[i]) { + // Skip divider. Look for a header line in following lines. + j := i + 1 + // Skip empty lines. + for j < len(lines) && strings.TrimSpace(lines[j]) == "" { + j++ + } + if j >= len(lines) { + break + } + // Match the header. + headerLine := lines[j] + headerMatch := headerRe.FindStringSubmatch(headerLine) + if headerMatch == nil { + i++ + continue + } + filename := headerMatch[1] + // Expect next divider. + j++ + for j < len(lines) && strings.TrimSpace(lines[j]) == "" { + j++ + } + if j >= len(lines) || !dividerRe.MatchString(lines[j]) { + // If no divider after header, this is not a well‐formed block. + i = j + continue + } + // Start collecting content after this divider. + j++ // move past the divider line + contentLines := []string{} + // Continue until we see a new divider that is immediately followed by a valid header. + for j < len(lines) { + // If this line is a divider candidate, + // check if the next non-empty line is a header line. + if dividerRe.MatchString(lines[j]) { + k := j + 1 + for k < len(lines) && strings.TrimSpace(lines[k]) == "" { + k++ + } + if k < len(lines) && headerRe.MatchString(lines[k]) { + // We've reached the next file block. + break + } + } + contentLines = append(contentLines, lines[j]) + j++ + } + blocks = append(blocks, fileBlock{ + name: strings.TrimSpace(filename), + content: strings.Join(contentLines, "\n"), + }) + i = j + } else { + i++ + } + } + return blocks + +}