diff options
| author | Christine Dodrill <me@christine.website> | 2018-09-24 19:29:33 -0700 |
|---|---|---|
| committer | Christine Dodrill <me@christine.website> | 2018-09-24 19:31:51 -0700 |
| commit | dc0c9aa8ae31948ecdcddcb1ead0a6041005299d (patch) | |
| tree | 71a54597b124dda33dac736cd3dc1770d0e29797 /web | |
| parent | 198b135a250c10ddbd4734c5fcaa70cb3586610b (diff) | |
| download | x-dc0c9aa8ae31948ecdcddcb1ead0a6041005299d.tar.xz x-dc0c9aa8ae31948ecdcddcb1ead0a6041005299d.zip | |
discord/ilo-kesi: move toki pona tokenizing code into its own folder, add markov code (sitelen pakala)
Diffstat (limited to 'web')
| -rw-r--r-- | web/tokiponatokens/doc.go | 5 | ||||
| -rw-r--r-- | web/tokiponatokens/toki_pona.go | 111 | ||||
| -rw-r--r-- | web/tokiponatokens/toki_pona_test.go | 10 |
3 files changed, 126 insertions, 0 deletions
/*
Package tokiponatokens is a wrapper to a Toki Pona tokenizer. I have an
instance set up here:
https://us-central1-golden-cove-408.cloudfunctions.net/function-1
*/

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"
)

// Part is an individual part of a sentence as returned by the tokenizer.
type Part struct {
	Type   string   `json:"part"`   // one of the Part* constants below
	Sep    *string  `json:"sep"`    // optional separator word; nil when absent
	Tokens []string `json:"tokens"` // the toki pona words making up this part
	Parts  []*Part  `json:"parts"`  // nested sub-parts (e.g. inside an address)
}

// String renders the part back into human-readable toki pona text.
//
// It panics on punctuation values this package does not know about, since
// that indicates a mismatch between the tokenizer service and this package.
func (p Part) String() string {
	switch p.Type {
	case PartPunctuation:
		// Guard the index so a malformed (empty) punctuation part gives
		// a clear diagnostic instead of an index-out-of-range panic.
		if len(p.Tokens) == 0 {
			panic("tokiponatokens: punctuation part with no tokens")
		}

		switch p.Tokens[0] {
		case PunctExclamation:
			return "!"
		case PunctPeriod:
			return "."
		case PunctQuestion:
			return "?"
		case PunctComma:
			return ","
		}

		panic("unknown punctuation " + p.Tokens[0])
	case PartAddress:
		if p.Parts == nil {
			if p.Sep == nil {
				return strings.Join(p.Tokens, " ")
			}

			// A separator marks a proper name: glue the syllables
			// together and capitalize ("so"+"weli" -> "Soweli").
			return strings.Title(strings.Join(p.Tokens, ""))
		}
	}

	// Generic rendering: nested parts, then the separator, then the tokens,
	// each followed by a single space (output keeps a trailing space).
	var sb strings.Builder

	for _, pt := range p.Parts {
		sb.WriteString(pt.String())
		sb.WriteRune(' ')
	}

	if p.Sep != nil {
		sb.WriteString(*p.Sep)
		sb.WriteRune(' ')
	}

	if len(p.Tokens) != 0 {
		sb.WriteString(strings.Join(p.Tokens, " "))
		sb.WriteRune(' ')
	}

	return sb.String()
}

// Individual part type values.
const (
	// Who/what the sentence is addressed to in Parts.
	PartAddress      = `address`
	PartSubject      = `subject`
	PartObjectMarker = `objectMarker`
	PartPrepPhrase   = `prepPhrase`
	PartInterjection = `interjection`
	// A foreign name.
	PartCartouche = `cartouche`
	// Most sentences will end in this.
	PartPunctuation = `punctuation`
)

// Punctuation constants.
const (
	PunctPeriod      = `period`
	PunctQuestion    = `question`
	PunctExclamation = `exclamation`
	PunctComma       = `comma`
)

// Sentence is a series of sentence parts. This correlates to one Toki Pona
// sentence.
type Sentence []Part

// Tokenize POSTs text to the tokenizer service at aurl and decodes the
// resulting sentences. It returns an error on transport failure, on a
// non-200 response, or when the response body is not valid JSON.
func Tokenize(aurl, text string) ([]Sentence, error) {
	req, err := http.NewRequest(http.MethodPost, aurl, strings.NewReader(text))
	if err != nil {
		return nil, err
	}

	req.Header.Add("Content-Type", "text/plain")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Fail fast on service errors; otherwise an HTML error page would
	// surface as a confusing JSON decode error.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("tokiponatokens: %s gave HTTP status %s", aurl, resp.Status)
	}

	var result []Sentence
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, err
	}

	return result, nil
}

// TestTokenizeTokiPona exercises Tokenize against a local fake of the
// tokenizer service, so the test needs no network access and does not
// depend on the availability of the Cloud Functions deployment.
func TestTokenizeTokiPona(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, `[[{"part":"subject","sep":null,"tokens":["mi"]},{"part":"punctuation","sep":null,"tokens":["period"]}]]`)
	}))
	defer srv.Close()

	sentences, err := Tokenize(srv.URL, "mi olin e sina.")
	if err != nil {
		t.Fatal(err)
	}
	if len(sentences) != 1 || len(sentences[0]) != 2 {
		t.Fatalf("unexpected tokenization: %#v", sentences)
	}
}
