diff options
| author | Xe Iaso <me@xeiaso.net> | 2024-06-16 18:50:48 -0400 |
|---|---|---|
| committer | Xe Iaso <me@xeiaso.net> | 2024-06-16 18:50:48 -0400 |
| commit | 6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc (patch) | |
| tree | 7095d60a6723ab845f6b5ad5dc18f5fc98403198 | |
| parent | 9dd33711159888efee90ec90f22ef8a8b92c310a (diff) | |
| download | x-6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc.tar.xz x-6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc.zip | |
cmd/hnscrape: assemble training data, oh god this is a bad idea
Signed-off-by: Xe Iaso <me@xeiaso.net>
| -rw-r--r-- | .go.mod.sri | 2 | ||||
| -rw-r--r-- | cmd/hnscrape/main.go | 242 | ||||
| -rw-r--r-- | go.mod | 1 | ||||
| -rw-r--r-- | go.sum | 2 |
4 files changed, 146 insertions, 101 deletions
diff --git a/.go.mod.sri b/.go.mod.sri index 63dda45..675cada 100644 --- a/.go.mod.sri +++ b/.go.mod.sri @@ -1 +1 @@ -sha256-sNob6z/emAimhdeI9xUkiPpV9lF2SeNpYNIK5JXZ51I= +sha256-ORnkWO720g6WQy58lrUqeSb7E1JvrymJS5a/PuPYmFs= diff --git a/cmd/hnscrape/main.go b/cmd/hnscrape/main.go index 78dc4f4..1c94a75 100644 --- a/cmd/hnscrape/main.go +++ b/cmd/hnscrape/main.go @@ -2,7 +2,10 @@ package main import ( "context" + "crypto/sha256" + "encoding/json" "flag" + "fmt" "log/slog" "os" "os/signal" @@ -10,7 +13,9 @@ import ( "syscall" "time" + "jaytaylor.com/html2text" "within.website/x/internal" + "within.website/x/llm" ) var ( @@ -19,20 +24,23 @@ var ( scrapeDelay = flag.Duration("scrape-delay", 50*time.Millisecond, "Delay between scraping items") ) +const systemMessage = `You are a commenter on the website "Hacker News". If asked for your name, you will respond with "Mimi". You should be friendly unless people are being mean to you, then you can be mean back.` + func main() { internal.HandleStartup() ctx, cancel := ControlCContext() defer cancel() - slog.Info("starting hnscrape", "scrapeDelay", scrapeDelay.String(), "hnUser", *hnUser) + slog.Debug("starting hnscrape", "scrapeDelay", scrapeDelay.String(), "hnUser", *hnUser) hn := NewHNClient(*scrapeDelay) if *cacheFolder != "" { - slog.Info("caching items to", "cacheFolder", *cacheFolder) + slog.Debug("caching items to", "cacheFolder", *cacheFolder) os.MkdirAll(*cacheFolder, 0755) os.MkdirAll(filepath.Join(*cacheFolder, "items"), 0755) os.MkdirAll(filepath.Join(*cacheFolder, "indices"), 0755) + os.MkdirAll(filepath.Join(*cacheFolder, "conversations"), 0755) hn = hn.WithCacheFolder(*cacheFolder) } @@ -42,114 +50,141 @@ func main() { os.Exit(1) } - slog.Info("got user", "user", u.Created.String(), "karma", u.Karma, "submitted", len(u.Submitted)) - - // itemsCommentedIn := make([]int, 0) - // - // wg := sync.WaitGroup{} - // - // wg.Add(len(u.Submitted)) - // - // for _, itemID := range u.Submitted { - // itemID := itemID - // - // item, err := hn.GetItem(ctx, itemID) - // if err != nil { - // slog.Error("failed to get item", "err", err, "itemID", itemID) - // continue - // } - // - // slog.Debug("got item", "item", item.ID) - // - // go func(hn *HNClient, item *HNItem) { - // defer wg.Done() - // - // if item.Type != "comment" { - // return - // } - // - // if item.Parent == nil { - // return - // } - // - // /* - // if len(item.Kids) != 0 { - // slog.Info("getting comment kids", "item", item.ID) - // for _, kid := range item.Kids { - // kid, err := hn.GetItem(ctx, kid) - // if err != nil { - // slog.Error("failed to get kid", "err", err, "kid", kid) - // continue - // } - // - // slog.Info("got kid", "kid", kid.ID) - // } - // }*/ - // - // slog.Debug("getting comment parent", "parent", item.Parent) - // - // parent, err := hn.GetItem(ctx, *item.Parent) - // if err != nil { - // if err == context.Canceled { - // return - // } - // slog.Error("failed to get parent", "err", err, "parent", item.Parent) - // return - // } - // - // slog.Debug("got parent", "parent", parent.ID) - // /* - // for _, kid := range parent.Kids { - // kid, err := hn.GetItem(ctx, kid) - // if err != nil { - // slog.Error("failed to get kid", "err", err, "kid", kid) - // continue - // } - // - // slog.Info("got kid", "kid", kid.ID) - // }*/ - // }(hn, item) - // - // /* - // if item.Type == "comment" { - // ultimateParent, err := hn.GetUltimateParent(ctx, item.ID) - // if err != nil { - // if err == context.Canceled { - // break - // } - // slog.Error("failed to get ultimate parent", "err", err, "item", item.ID) - // continue - // } - // itemsCommentedIn = append(itemsCommentedIn, ultimateParent.ID) - // } - // */ - // } - // - // wg.Wait() - - /* - slog.Info("done", "itemsCommentedIn", len(itemsCommentedIn)) - - fout, err := os.Create(filepath.Join(*cacheFolder, "indices", "itemsCommentedIn")) + slog.Debug("got user", "user", u.Created.String(), "karma", u.Karma, "submitted", len(u.Submitted)) + + reverseIntSlice(u.Submitted) + + conversations := map[string][]int{} + + for _, itemID := range u.Submitted { + item, err := hn.GetItem(ctx, itemID) if err != nil { - slog.Error("failed to create itemsCommentedIn file", "err", err) + slog.Error("failed to get item", "err", err, "itemID", itemID) os.Exit(1) } - defer fout.Close() - if err := json.NewEncoder(fout).Encode(itemsCommentedIn); err != nil { - slog.Error("failed to write itemsCommentedIn", "err", err) - os.Exit(1) + if item.Type != "comment" { + continue + } + + if item.Parent == nil { + continue + } + + parent, err := hn.GetItem(ctx, *item.Parent) + if err != nil { + slog.Error("failed to get parent", "err", err, "itemID", item.ID) + continue } - */ + _ = parent - pathToRoot, err := hn.PathToRoot(ctx, 40699123) + pathToRoot, err := hn.PathToRoot(ctx, item.ID) + if err != nil { + slog.Error("failed to get path to root", "err", err, "itemID", item.ID) + continue + } + + conversationID, err := getConversationIDName(pathToRoot) + if err != nil { + slog.Error("failed to get conversation ID", "err", err, "itemID", item.ID) + continue + } + + slog.Info("got conversation ID", "itemID", item.ID, "conversationID", conversationID) + + conversations[conversationID] = pathToRoot + } + + fout, err := os.Create(filepath.Join(*cacheFolder, "train.jsonl")) if err != nil { - slog.Error("failed to get path to root", "err", err) + slog.Error("failed to create train file", "err", err) + os.Exit(1) + } + defer fout.Close() + + for conversationID, path := range conversations { + items := []*HNItem{} + + for _, itemID := range path { + item, err := hn.GetItem(ctx, itemID) + if err != nil { + slog.Error("failed to get item", "err", err, "itemID", itemID) + os.Exit(1) + } + + items = append(items, item) + } + + messages := []llm.Message{} + + for i, item := range items { + _ = i + text := item.Text + role := "user" + + if item.Type == "story" { + role = "system" + text = systemMessage + "\n\n" + item.URL + ": " + item.Title + } + + if item.By == *hnUser { + role = "assistant" + } + + if role == "user" && len(items) > i+1 && items[i+1].By != *hnUser { + next := items[i+1] + next.Text = text + "\n\n" + next.Text + continue + } + + plainText, err := html2text.FromString(text, html2text.Options{OmitLinks: true}) + if err != nil { + slog.Error("failed to convert HTML to text", "err", err, "itemID", item.ID) + os.Exit(1) + } + + messages = append(messages, llm.Message{ + Role: role, + Content: plainText, + }) + } + + if err := json.NewEncoder(fout).Encode(messages); err != nil { + slog.Error("failed to write conversation", "err", err, "conversationID", conversationID) + os.Exit(1) + } + } + + if err := json.NewEncoder(fout).Encode([]llm.Message{ + { + Role: "system", + Content: systemMessage + "\n\nhttps://xeiaso.net: Xe Iaso", + }, + { + Role: "user", + Content: "What is your name?", + }, + { + Role: "assistant", + Content: "My name is Mimi, duh!", + }, + }); err != nil { + slog.Error("failed to write conversation", "err", err) os.Exit(1) } +} + +func getConversationIDName(path []int) (string, error) { + if len(path) == 0 { + return "", fmt.Errorf("path is empty you goofus") + } - slog.Info("got path to root", "pathToRoot", pathToRoot) + h := sha256.New() + if err := json.NewEncoder(h).Encode(path); err != nil { + return "", fmt.Errorf("failed to encode path: %w", err) + } + + return fmt.Sprintf("%x", h.Sum(nil)), nil } func ControlCContext() (context.Context, context.CancelFunc) { @@ -166,3 +201,10 @@ func ControlCContext() (context.Context, context.CancelFunc) { return ctx, cancel } + +func reverseIntSlice(s []int) { + for i := len(s)/2 - 1; i >= 0; i-- { + opp := len(s) - 1 - i + s[i], s[opp] = s[opp], s[i] + } +} @@ -225,6 +225,7 @@ require ( google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + jaytaylor.com/html2text v0.0.0-20230321000545-74c2419ad056 // indirect lukechampine.com/blake3 v1.2.1 // indirect modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect modernc.org/libc v1.50.9 // indirect @@ -1657,6 +1657,8 @@ honnef.co/go/transmission v0.0.0-20200712215954-58262f0ca9c9 h1:gvbgV/dEfmGblqVW honnef.co/go/transmission v0.0.0-20200712215954-58262f0ca9c9/go.mod h1:2zVSJxgSdzVCl199AZjjT78vllJowC9UnQJ5l3X5e0M= howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= +jaytaylor.com/html2text v0.0.0-20230321000545-74c2419ad056 h1:6YFJoB+0fUH6X3xU/G2tQqCYg+PkGtnZ5nMR5rpw72g= +jaytaylor.com/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:OxvTsCwKosqQ1q7B+8FwXqg4rKZ/UG9dUW+g/VL2xH4= lukechampine.com/blake3 v1.2.1 h1:YuqqRuaqsGV71BV/nm9xlI0MKUv4QC54jQnBChWbGnI= lukechampine.com/blake3 v1.2.1/go.mod h1:0OFRp7fBtAylGVCO40o87sbupkyIGgbpv1+M1k1LM6k= modernc.org/cc/v4 v4.21.2 h1:dycHFB/jDc3IyacKipCNSDrjIC0Lm1hyoWOZTRR20Lk= |
