about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Xe Iaso <me@xeiaso.net> 2024-06-16 18:50:48 -0400
committer Xe Iaso <me@xeiaso.net> 2024-06-16 18:50:48 -0400
commit 6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc (patch)
tree 7095d60a6723ab845f6b5ad5dc18f5fc98403198
parent 9dd33711159888efee90ec90f22ef8a8b92c310a (diff)
download x-6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc.tar.xz
x-6fbb0c3894bb3dfff5fb5487ac581f9c942a97bc.zip
cmd/hnscrape: assemble training data, oh god this is a bad idea
Signed-off-by: Xe Iaso <me@xeiaso.net>
-rw-r--r-- .go.mod.sri 2
-rw-r--r-- cmd/hnscrape/main.go 242
-rw-r--r-- go.mod 1
-rw-r--r-- go.sum 2
4 files changed, 146 insertions, 101 deletions
diff --git a/.go.mod.sri b/.go.mod.sri
index 63dda45..675cada 100644
--- a/.go.mod.sri
+++ b/.go.mod.sri
@@ -1 +1 @@
-sha256-sNob6z/emAimhdeI9xUkiPpV9lF2SeNpYNIK5JXZ51I=
+sha256-ORnkWO720g6WQy58lrUqeSb7E1JvrymJS5a/PuPYmFs=
diff --git a/cmd/hnscrape/main.go b/cmd/hnscrape/main.go
index 78dc4f4..1c94a75 100644
--- a/cmd/hnscrape/main.go
+++ b/cmd/hnscrape/main.go
@@ -2,7 +2,10 @@ package main
import (
"context"
+ "crypto/sha256"
+ "encoding/json"
"flag"
+ "fmt"
"log/slog"
"os"
"os/signal"
@@ -10,7 +13,9 @@ import (
"syscall"
"time"
+ "jaytaylor.com/html2text"
"within.website/x/internal"
+ "within.website/x/llm"
)
var (
@@ -19,20 +24,23 @@ var (
scrapeDelay = flag.Duration("scrape-delay", 50*time.Millisecond, "Delay between scraping items")
)
+const systemMessage = `You are a commenter on the website "Hacker News". If asked for your name, you will respond with "Mimi". You should be friendly unless people are being mean to you, then you can be mean back.`
+
func main() {
internal.HandleStartup()
ctx, cancel := ControlCContext()
defer cancel()
- slog.Info("starting hnscrape", "scrapeDelay", scrapeDelay.String(), "hnUser", *hnUser)
+ slog.Debug("starting hnscrape", "scrapeDelay", scrapeDelay.String(), "hnUser", *hnUser)
hn := NewHNClient(*scrapeDelay)
if *cacheFolder != "" {
- slog.Info("caching items to", "cacheFolder", *cacheFolder)
+ slog.Debug("caching items to", "cacheFolder", *cacheFolder)
os.MkdirAll(*cacheFolder, 0755)
os.MkdirAll(filepath.Join(*cacheFolder, "items"), 0755)
os.MkdirAll(filepath.Join(*cacheFolder, "indices"), 0755)
+ os.MkdirAll(filepath.Join(*cacheFolder, "conversations"), 0755)
hn = hn.WithCacheFolder(*cacheFolder)
}
@@ -42,114 +50,141 @@ func main() {
os.Exit(1)
}
- slog.Info("got user", "user", u.Created.String(), "karma", u.Karma, "submitted", len(u.Submitted))
-
- // itemsCommentedIn := make([]int, 0)
- //
- // wg := sync.WaitGroup{}
- //
- // wg.Add(len(u.Submitted))
- //
- // for _, itemID := range u.Submitted {
- // itemID := itemID
- //
- // item, err := hn.GetItem(ctx, itemID)
- // if err != nil {
- // slog.Error("failed to get item", "err", err, "itemID", itemID)
- // continue
- // }
- //
- // slog.Debug("got item", "item", item.ID)
- //
- // go func(hn *HNClient, item *HNItem) {
- // defer wg.Done()
- //
- // if item.Type != "comment" {
- // return
- // }
- //
- // if item.Parent == nil {
- // return
- // }
- //
- // /*
- // if len(item.Kids) != 0 {
- // slog.Info("getting comment kids", "item", item.ID)
- // for _, kid := range item.Kids {
- // kid, err := hn.GetItem(ctx, kid)
- // if err != nil {
- // slog.Error("failed to get kid", "err", err, "kid", kid)
- // continue
- // }
- //
- // slog.Info("got kid", "kid", kid.ID)
- // }
- // }*/
- //
- // slog.Debug("getting comment parent", "parent", item.Parent)
- //
- // parent, err := hn.GetItem(ctx, *item.Parent)
- // if err != nil {
- // if err == context.Canceled {
- // return
- // }
- // slog.Error("failed to get parent", "err", err, "parent", item.Parent)
- // return
- // }
- //
- // slog.Debug("got parent", "parent", parent.ID)
- // /*
- // for _, kid := range parent.Kids {
- // kid, err := hn.GetItem(ctx, kid)
- // if err != nil {
- // slog.Error("failed to get kid", "err", err, "kid", kid)
- // continue
- // }
- //
- // slog.Info("got kid", "kid", kid.ID)
- // }*/
- // }(hn, item)
- //
- // /*
- // if item.Type == "comment" {
- // ultimateParent, err := hn.GetUltimateParent(ctx, item.ID)
- // if err != nil {
- // if err == context.Canceled {
- // break
- // }
- // slog.Error("failed to get ultimate parent", "err", err, "item", item.ID)
- // continue
- // }
- // itemsCommentedIn = append(itemsCommentedIn, ultimateParent.ID)
- // }
- // */
- // }
- //
- // wg.Wait()
-
- /*
- slog.Info("done", "itemsCommentedIn", len(itemsCommentedIn))
-
- fout, err := os.Create(filepath.Join(*cacheFolder, "indices", "itemsCommentedIn"))
+ slog.Debug("got user", "user", u.Created.String(), "karma", u.Karma, "submitted", len(u.Submitted))
+
+ reverseIntSlice(u.Submitted)
+
+ conversations := map[string][]int{}
+
+ for _, itemID := range u.Submitted {
+ item, err := hn.GetItem(ctx, itemID)
if err != nil {
- slog.Error("failed to create itemsCommentedIn file", "err", err)
+ slog.Error("failed to get item", "err", err, "itemID", itemID)
os.Exit(1)
}
- defer fout.Close()
- if err := json.NewEncoder(fout).Encode(itemsCommentedIn); err != nil {
- slog.Error("failed to write itemsCommentedIn", "err", err)
- os.Exit(1)
+ if item.Type != "comment" {
+ continue
+ }
+
+ if item.Parent == nil {
+ continue
+ }
+
+ parent, err := hn.GetItem(ctx, *item.Parent)
+ if err != nil {
+ slog.Error("failed to get parent", "err", err, "itemID", item.ID)
+ continue
}
- */
+ _ = parent
- pathToRoot, err := hn.PathToRoot(ctx, 40699123)
+ pathToRoot, err := hn.PathToRoot(ctx, item.ID)
+ if err != nil {
+ slog.Error("failed to get path to root", "err", err, "itemID", item.ID)
+ continue
+ }
+
+ conversationID, err := getConversationIDName(pathToRoot)
+ if err != nil {
+ slog.Error("failed to get conversation ID", "err", err, "itemID", item.ID)
+ continue
+ }
+
+ slog.Info("got conversation ID", "itemID", item.ID, "conversationID", conversationID)
+
+ conversations[conversationID] = pathToRoot
+ }
+
+ fout, err := os.Create(filepath.Join(*cacheFolder, "train.jsonl"))
if err != nil {
- slog.Error("failed to get path to root", "err", err)
+ slog.Error("failed to create train file", "err", err)
+ os.Exit(1)
+ }
+ defer fout.Close()
+
+ for conversationID, path := range conversations {
+ items := []*HNItem{}
+
+ for _, itemID := range path {
+ item, err := hn.GetItem(ctx, itemID)
+ if err != nil {
+ slog.Error("failed to get item", "err", err, "itemID", itemID)
+ os.Exit(1)
+ }
+
+ items = append(items, item)
+ }
+
+ messages := []llm.Message{}
+
+ for i, item := range items {
+ _ = i
+ text := item.Text
+ role := "user"
+
+ if item.Type == "story" {
+ role = "system"
+ text = systemMessage + "\n\n" + item.URL + ": " + item.Title
+ }
+
+ if item.By == *hnUser {
+ role = "assistant"
+ }
+
+ if role == "user" && len(items) > i+1 && items[i+1].By != *hnUser {
+ next := items[i+1]
+ next.Text = text + "\n\n" + next.Text
+ continue
+ }
+
+ plainText, err := html2text.FromString(text, html2text.Options{OmitLinks: true})
+ if err != nil {
+ slog.Error("failed to convert HTML to text", "err", err, "itemID", item.ID)
+ os.Exit(1)
+ }
+
+ messages = append(messages, llm.Message{
+ Role: role,
+ Content: plainText,
+ })
+ }
+
+ if err := json.NewEncoder(fout).Encode(messages); err != nil {
+ slog.Error("failed to write conversation", "err", err, "conversationID", conversationID)
+ os.Exit(1)
+ }
+ }
+
+ if err := json.NewEncoder(fout).Encode([]llm.Message{
+ {
+ Role: "system",
+ Content: systemMessage + "\n\nhttps://xeiaso.net: Xe Iaso",
+ },
+ {
+ Role: "user",
+ Content: "What is your name?",
+ },
+ {
+ Role: "assistant",
+ Content: "My name is Mimi, duh!",
+ },
+ }); err != nil {
+ slog.Error("failed to write conversation", "err", err)
os.Exit(1)
}
+}
+
+func getConversationIDName(path []int) (string, error) {
+ if len(path) == 0 {
+ return "", fmt.Errorf("path is empty you goofus")
+ }
- slog.Info("got path to root", "pathToRoot", pathToRoot)
+ h := sha256.New()
+ if err := json.NewEncoder(h).Encode(path); err != nil {
+ return "", fmt.Errorf("failed to encode path: %w", err)
+ }
+
+ return fmt.Sprintf("%x", h.Sum(nil)), nil
}
func ControlCContext() (context.Context, context.CancelFunc) {
@@ -166,3 +201,10 @@ func ControlCContext() (context.Context, context.CancelFunc) {
return ctx, cancel
}
+
+func reverseIntSlice(s []int) {
+ for i := len(s)/2 - 1; i >= 0; i-- {
+ opp := len(s) - 1 - i
+ s[i], s[opp] = s[opp], s[i]
+ }
+}
diff --git a/go.mod b/go.mod
index fe11d3d..5ecbfcc 100644
--- a/go.mod
+++ b/go.mod
@@ -225,6 +225,7 @@ require (
google.golang.org/genproto/googleapis/rpc v0.0.0-20240318140521-94a12d6c2237 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
+ jaytaylor.com/html2text v0.0.0-20230321000545-74c2419ad056 // indirect
lukechampine.com/blake3 v1.2.1 // indirect
modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect
modernc.org/libc v1.50.9 // indirect
diff --git a/go.sum b/go.sum
index c61095d..5f3597e 100644
--- a/go.sum
+++ b/go.sum
@@ -1657,6 +1657,8 @@ honnef.co/go/transmission v0.0.0-20200712215954-58262f0ca9c9 h1:gvbgV/dEfmGblqVW
honnef.co/go/transmission v0.0.0-20200712215954-58262f0ca9c9/go.mod h1:2zVSJxgSdzVCl199AZjjT78vllJowC9UnQJ5l3X5e0M=
howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=
+jaytaylor.com/html2text v0.0.0-20230321000545-74c2419ad056 h1:6YFJoB+0fUH6X3xU/G2tQqCYg+PkGtnZ5nMR5rpw72g=
+jaytaylor.com/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:OxvTsCwKosqQ1q7B+8FwXqg4rKZ/UG9dUW+g/VL2xH4=
lukechampine.com/blake3 v1.2.1 h1:YuqqRuaqsGV71BV/nm9xlI0MKUv4QC54jQnBChWbGnI=
lukechampine.com/blake3 v1.2.1/go.mod h1:0OFRp7fBtAylGVCO40o87sbupkyIGgbpv1+M1k1LM6k=
modernc.org/cc/v4 v4.21.2 h1:dycHFB/jDc3IyacKipCNSDrjIC0Lm1hyoWOZTRR20Lk=