aboutsummaryrefslogtreecommitdiff
path: root/cmd
diff options
context:
space:
mode:
author Xe Iaso <me@xeiaso.net> 2024-06-16 18:50:48 -0400
committer Xe Iaso <me@xeiaso.net> 2024-06-16 18:50:48 -0400
commit 6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc (patch)
tree 7095d60a6723ab845f6b5ad5dc18f5fc98403198 /cmd
parent 9dd33711159888efee90ec90f22ef8a8b92c310a (diff)
download x-6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc.tar.xz
x-6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc.zip
cmd/hnscrape: assemble training data, oh god this is a bad idea
Signed-off-by: Xe Iaso <me@xeiaso.net>
Diffstat (limited to 'cmd')
-rw-r--r-- cmd/hnscrape/main.go 242
1 files changed, 142 insertions, 100 deletions
diff --git a/cmd/hnscrape/main.go b/cmd/hnscrape/main.go
index 78dc4f4..1c94a75 100644
--- a/cmd/hnscrape/main.go
+++ b/cmd/hnscrape/main.go
@@ -2,7 +2,10 @@ package main
import (
"context"
+ "crypto/sha256"
+ "encoding/json"
"flag"
+ "fmt"
"log/slog"
"os"
"os/signal"
@@ -10,7 +13,9 @@ import (
"syscall"
"time"
+ "jaytaylor.com/html2text"
"within.website/x/internal"
+ "within.website/x/llm"
)
var (
@@ -19,20 +24,23 @@ var (
scrapeDelay = flag.Duration("scrape-delay", 50*time.Millisecond, "Delay between scraping items")
)
+// systemMessage is the shared prefix of every system-role message written to
+// the training file; main appends a blank line and a "URL: title" line to it.
+const systemMessage = `You are a commenter on the website "Hacker News". If asked for your name, you will respond with "Mimi". You should be friendly unless people are being mean to you, then you can be mean back.`
+
func main() {
internal.HandleStartup()
ctx, cancel := ControlCContext()
defer cancel()
- slog.Info("starting hnscrape", "scrapeDelay", scrapeDelay.String(), "hnUser", *hnUser)
+ slog.Debug("starting hnscrape", "scrapeDelay", scrapeDelay.String(), "hnUser", *hnUser)
hn := NewHNClient(*scrapeDelay)
if *cacheFolder != "" {
- slog.Info("caching items to", "cacheFolder", *cacheFolder)
+ slog.Debug("caching items to", "cacheFolder", *cacheFolder)
os.MkdirAll(*cacheFolder, 0755)
os.MkdirAll(filepath.Join(*cacheFolder, "items"), 0755)
os.MkdirAll(filepath.Join(*cacheFolder, "indices"), 0755)
+ os.MkdirAll(filepath.Join(*cacheFolder, "conversations"), 0755)
hn = hn.WithCacheFolder(*cacheFolder)
}
@@ -42,114 +50,141 @@ func main() {
os.Exit(1)
}
- slog.Info("got user", "user", u.Created.String(), "karma", u.Karma, "submitted", len(u.Submitted))
-
- // itemsCommentedIn := make([]int, 0)
- //
- // wg := sync.WaitGroup{}
- //
- // wg.Add(len(u.Submitted))
- //
- // for _, itemID := range u.Submitted {
- // itemID := itemID
- //
- // item, err := hn.GetItem(ctx, itemID)
- // if err != nil {
- // slog.Error("failed to get item", "err", err, "itemID", itemID)
- // continue
- // }
- //
- // slog.Debug("got item", "item", item.ID)
- //
- // go func(hn *HNClient, item *HNItem) {
- // defer wg.Done()
- //
- // if item.Type != "comment" {
- // return
- // }
- //
- // if item.Parent == nil {
- // return
- // }
- //
- // /*
- // if len(item.Kids) != 0 {
- // slog.Info("getting comment kids", "item", item.ID)
- // for _, kid := range item.Kids {
- // kid, err := hn.GetItem(ctx, kid)
- // if err != nil {
- // slog.Error("failed to get kid", "err", err, "kid", kid)
- // continue
- // }
- //
- // slog.Info("got kid", "kid", kid.ID)
- // }
- // }*/
- //
- // slog.Debug("getting comment parent", "parent", item.Parent)
- //
- // parent, err := hn.GetItem(ctx, *item.Parent)
- // if err != nil {
- // if err == context.Canceled {
- // return
- // }
- // slog.Error("failed to get parent", "err", err, "parent", item.Parent)
- // return
- // }
- //
- // slog.Debug("got parent", "parent", parent.ID)
- // /*
- // for _, kid := range parent.Kids {
- // kid, err := hn.GetItem(ctx, kid)
- // if err != nil {
- // slog.Error("failed to get kid", "err", err, "kid", kid)
- // continue
- // }
- //
- // slog.Info("got kid", "kid", kid.ID)
- // }*/
- // }(hn, item)
- //
- // /*
- // if item.Type == "comment" {
- // ultimateParent, err := hn.GetUltimateParent(ctx, item.ID)
- // if err != nil {
- // if err == context.Canceled {
- // break
- // }
- // slog.Error("failed to get ultimate parent", "err", err, "item", item.ID)
- // continue
- // }
- // itemsCommentedIn = append(itemsCommentedIn, ultimateParent.ID)
- // }
- // */
- // }
- //
- // wg.Wait()
-
- /*
- slog.Info("done", "itemsCommentedIn", len(itemsCommentedIn))
-
- fout, err := os.Create(filepath.Join(*cacheFolder, "indices", "itemsCommentedIn"))
+ slog.Debug("got user", "user", u.Created.String(), "karma", u.Karma, "submitted", len(u.Submitted))
+
+ reverseIntSlice(u.Submitted)
+
+ conversations := map[string][]int{}
+
+ for _, itemID := range u.Submitted {
+ item, err := hn.GetItem(ctx, itemID)
if err != nil {
- slog.Error("failed to create itemsCommentedIn file", "err", err)
+ slog.Error("failed to get item", "err", err, "itemID", itemID)
os.Exit(1)
}
- defer fout.Close()
- if err := json.NewEncoder(fout).Encode(itemsCommentedIn); err != nil {
- slog.Error("failed to write itemsCommentedIn", "err", err)
- os.Exit(1)
+ if item.Type != "comment" {
+ continue
+ }
+
+ if item.Parent == nil {
+ continue
+ }
+
+ parent, err := hn.GetItem(ctx, *item.Parent)
+ if err != nil {
+ slog.Error("failed to get parent", "err", err, "itemID", item.ID)
+ continue
}
- */
+ _ = parent
- pathToRoot, err := hn.PathToRoot(ctx, 40699123)
+ pathToRoot, err := hn.PathToRoot(ctx, item.ID)
+ if err != nil {
+ slog.Error("failed to get path to root", "err", err, "itemID", item.ID)
+ continue
+ }
+
+ conversationID, err := getConversationIDName(pathToRoot)
+ if err != nil {
+ slog.Error("failed to get conversation ID", "err", err, "itemID", item.ID)
+ continue
+ }
+
+ slog.Info("got conversation ID", "itemID", item.ID, "conversationID", conversationID)
+
+ conversations[conversationID] = pathToRoot
+ }
+
+ fout, err := os.Create(filepath.Join(*cacheFolder, "train.jsonl"))
if err != nil {
- slog.Error("failed to get path to root", "err", err)
+ slog.Error("failed to create train file", "err", err)
+ os.Exit(1)
+ }
+ defer fout.Close()
+
+ for conversationID, path := range conversations {
+ items := []*HNItem{}
+
+ for _, itemID := range path {
+ item, err := hn.GetItem(ctx, itemID)
+ if err != nil {
+ slog.Error("failed to get item", "err", err, "itemID", itemID)
+ os.Exit(1)
+ }
+
+ items = append(items, item)
+ }
+
+ messages := []llm.Message{}
+
+ for i, item := range items {
+ _ = i
+ text := item.Text
+ role := "user"
+
+ if item.Type == "story" {
+ role = "system"
+ text = systemMessage + "\n\n" + item.URL + ": " + item.Title
+ }
+
+ if item.By == *hnUser {
+ role = "assistant"
+ }
+
+ if role == "user" && len(items) > i+1 && items[i+1].By != *hnUser {
+ next := items[i+1]
+ next.Text = text + "\n\n" + next.Text
+ continue
+ }
+
+ plainText, err := html2text.FromString(text, html2text.Options{OmitLinks: true})
+ if err != nil {
+ slog.Error("failed to convert HTML to text", "err", err, "itemID", item.ID)
+ os.Exit(1)
+ }
+
+ messages = append(messages, llm.Message{
+ Role: role,
+ Content: plainText,
+ })
+ }
+
+ if err := json.NewEncoder(fout).Encode(messages); err != nil {
+ slog.Error("failed to write conversation", "err", err, "conversationID", conversationID)
+ os.Exit(1)
+ }
+ }
+
+ if err := json.NewEncoder(fout).Encode([]llm.Message{
+ {
+ Role: "system",
+ Content: systemMessage + "\n\nhttps://xeiaso.net: Xe Iaso",
+ },
+ {
+ Role: "user",
+ Content: "What is your name?",
+ },
+ {
+ Role: "assistant",
+ Content: "My name is Mimi, duh!",
+ },
+ }); err != nil {
+ slog.Error("failed to write conversation", "err", err)
os.Exit(1)
}
+}
+
+// getConversationIDName derives an identifier for a conversation from the
+// item IDs on its path. The path is JSON-encoded, a single newline is
+// appended, and the SHA-256 digest of those bytes is returned as lowercase
+// hex. An empty (or nil) path is rejected with an error.
+func getConversationIDName(path []int) (string, error) {
+	if len(path) == 0 {
+		return "", fmt.Errorf("path is empty you goofus")
+	}
- slog.Info("got path to root", "pathToRoot", pathToRoot)
+	encoded, err := json.Marshal(path)
+	if err != nil {
+		return "", fmt.Errorf("failed to encode path: %w", err)
+	}
+
+	// The trailing newline is part of the hashed input; dropping it would
+	// change every conversation ID.
+	digest := sha256.Sum256(append(encoded, '\n'))
+	return fmt.Sprintf("%x", digest[:]), nil
}
func ControlCContext() (context.Context, context.CancelFunc) {
@@ -166,3 +201,10 @@ func ControlCContext() (context.Context, context.CancelFunc) {
return ctx, cancel
}
+
+// reverseIntSlice reverses s in place by swapping elements pairwise from the
+// two ends toward the middle. A nil, empty or single-element slice is left
+// untouched.
+func reverseIntSlice(s []int) {
+	for lo, hi := 0, len(s)-1; lo < hi; lo, hi = lo+1, hi-1 {
+		s[lo], s[hi] = s[hi], s[lo]
+	}
+}