Files
llm-file-parser/main.go

169 lines
4.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"bufio"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"path/filepath"
"regexp"
"strings"
)
// fileBlock is one parsed entry from the input: the name the file should be
// written under and the text that goes into it.
type fileBlock struct {
	name, content string
}
// main reads text from the file named by the first positional argument (or
// from STDIN when no argument is given or the argument is "-"), splits it into
// file blocks, and writes each block beneath the directory given by -out.
//
// Two layouts are recognised. "File: <name>" headers followed by a dashed line
// are tried first; if none are found, the numbered layout handled by
// parseNumberedBlocks is used instead.
//
// Failing to read the input or to create the output directory is fatal. A
// problem with an individual file (unsafe name, directory or write error) is
// logged and the remaining files are still written.
func main() {
	// Define the output directory flag.
	outDir := flag.String("out", ".", "Directory to write output files")
	flag.Parse()

	// Read input from a file or STDIN.
	var data []byte
	var err error
	if flag.NArg() < 1 || flag.Arg(0) == "-" {
		data, err = io.ReadAll(os.Stdin)
		if err != nil {
			log.Fatalf("Error reading from STDIN: %v", err)
		}
	} else {
		inputFile := flag.Arg(0)
		data, err = ioutil.ReadFile(inputFile)
		if err != nil {
			log.Fatalf("Error reading file %s: %v", inputFile, err)
		}
	}
	text := string(data)

	// First, try the "File:" format: a header line, a dashed line, then the
	// content up to the next dashed line or the end of the input.
	reFile := regexp.MustCompile(`(?s)File:\s*(.+?)\s*\r?\n-+\r?\n(.*?)(\r?\n-+\r?\n|$)`)
	matches := reFile.FindAllStringSubmatch(text, -1)

	var blocks []fileBlock
	if len(matches) > 0 {
		for _, m := range matches {
			blocks = append(blocks, fileBlock{
				name:    strings.TrimSpace(m[1]),
				content: m[2],
			})
		}
	} else {
		// Fall back to parsing the numbered format manually.
		blocks = parseNumberedBlocks(text)
	}
	if len(blocks) == 0 {
		log.Println("No file entries found in the input.")
		return
	}

	// Ensure the output directory exists.
	if err := os.MkdirAll(*outDir, os.ModePerm); err != nil {
		log.Fatalf("Error creating output directory %s: %v", *outDir, err)
	}

	// Write the blocks out as files.
	for _, block := range blocks {
		// The names come straight from the parsed input, so refuse anything
		// that would land outside the output directory (e.g. "../../x").
		fullPath, pathErr := safeJoin(*outDir, block.name)
		if pathErr != nil {
			log.Printf("Skipping file %q: %v", block.name, pathErr)
			continue
		}
		dir := filepath.Dir(fullPath)
		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
			log.Printf("Error creating directory %s: %v", dir, err)
			continue
		}
		if err := ioutil.WriteFile(fullPath, []byte(block.content), 0644); err != nil {
			log.Printf("Error writing file %s: %v", fullPath, err)
			continue
		}
		fmt.Printf("Created file: %s\n", fullPath)
	}
}

// safeJoin joins name onto baseDir and returns the result only if it names a
// location strictly inside baseDir. It returns an error when the cleaned path
// is baseDir itself or climbs out of it through ".." elements.
//
// The check is purely lexical: symbolic links that already exist beneath
// baseDir are not resolved.
func safeJoin(baseDir, name string) (string, error) {
	fullPath := filepath.Join(baseDir, name)
	rel, err := filepath.Rel(baseDir, fullPath)
	if err != nil {
		return "", fmt.Errorf("resolving %q against %q: %w", name, baseDir, err)
	}
	if rel == "." {
		return "", fmt.Errorf("name %q does not refer to a file inside %q", name, baseDir)
	}
	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return "", fmt.Errorf("name %q escapes output directory %q", name, baseDir)
	}
	return fullPath, nil
}
// parseNumberedBlocks extracts file blocks from text in the numbered layout,
// where each file is introduced by a header framed by two divider lines:
//
//	--------------------
//	1. index.html
//	--------------------
//	<file content>
//
// A divider is a line consisting only of ten or more '-' or '─' characters,
// optionally surrounded by whitespace. A header is a line of the form
// "<digits>. <name>". Blank lines between a divider and the header are
// ignored. A block's content runs until the next divider that is itself
// followed by a header, or until the end of the text; content lines are joined
// with "\n".
//
// It returns the blocks in input order, or nil when none are found.
func parseNumberedBlocks(text string) []fileBlock {
	// Divider line: at least 10 dashes or box drawing characters.
	dividerRe := regexp.MustCompile(`^[\s]*[─-]{10,}[\s]*$`)
	// Header line such as "1. index.html"; group 1 is the file name.
	headerRe := regexp.MustCompile(`^\s*\d+\.\s*(.+?)\s*$`)

	scanner := bufio.NewScanner(strings.NewReader(text))
	// By default a Scanner stops at the first line longer than 64 KiB, which
	// would silently drop that line and everything after it. Allow the buffer
	// to grow to the size of the whole input so no line can exceed the limit;
	// with an in-memory reader that leaves nothing for scanner.Err to report.
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), len(text)+1)
	var lines []string
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}

	// skipBlank returns the index of the first non-blank line at or after idx,
	// or len(lines) if there is none.
	skipBlank := func(idx int) int {
		for idx < len(lines) && strings.TrimSpace(lines[idx]) == "" {
			idx++
		}
		return idx
	}

	var blocks []fileBlock
	i := 0
	for i < len(lines) {
		// Every block starts at a divider line.
		if !dividerRe.MatchString(lines[i]) {
			i++
			continue
		}

		// The next non-blank line must be the header.
		j := skipBlank(i + 1)
		if j >= len(lines) {
			break
		}
		headerMatch := headerRe.FindStringSubmatch(lines[j])
		if headerMatch == nil {
			i++
			continue
		}

		// The header must be closed by a second divider.
		j = skipBlank(j + 1)
		if j >= len(lines) || !dividerRe.MatchString(lines[j]) {
			// Not a well-formed block; resume scanning from here.
			i = j
			continue
		}
		j++ // move past the closing divider

		// Collect content until a divider that is followed by a valid header.
		// A divider-looking line that is not followed by a header is ordinary
		// content.
		var contentLines []string
		for j < len(lines) {
			if dividerRe.MatchString(lines[j]) {
				k := skipBlank(j + 1)
				if k < len(lines) && headerRe.MatchString(lines[k]) {
					break
				}
			}
			contentLines = append(contentLines, lines[j])
			j++
		}

		blocks = append(blocks, fileBlock{
			name:    strings.TrimSpace(headerMatch[1]),
			content: strings.Join(contentLines, "\n"),
		})
		// j sits on the next block's opening divider (or at the end).
		i = j
	}
	return blocks
}