aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/jaytaylor/html2text/html2text.go
diff options
context:
space:
mode:
authorChristine Dodrill <me@christine.website>2018-10-19 06:24:23 -0700
committerChristine Dodrill <me@christine.website>2018-10-19 06:28:37 -0700
commit273b48a8b409126e14f8beb23cee4255d1f08a2c (patch)
treefeb316d464d92e143732ca3eac525fe4ca3cda1d /vendor/github.com/jaytaylor/html2text/html2text.go
parent456deb2bbadbcc8fd218a2297ac4879c069ef0ba (diff)
downloadx-273b48a8b409126e14f8beb23cee4255d1f08a2c.tar.xz
x-273b48a8b409126e14f8beb23cee4255d1f08a2c.zip
GOPROXY means we can avoid vendoring
Diffstat (limited to 'vendor/github.com/jaytaylor/html2text/html2text.go')
-rw-r--r--vendor/github.com/jaytaylor/html2text/html2text.go473
1 files changed, 0 insertions, 473 deletions
diff --git a/vendor/github.com/jaytaylor/html2text/html2text.go b/vendor/github.com/jaytaylor/html2text/html2text.go
deleted file mode 100644
index fa36990..0000000
--- a/vendor/github.com/jaytaylor/html2text/html2text.go
+++ /dev/null
@@ -1,473 +0,0 @@
-package html2text
-
-import (
- "bytes"
- "io"
- "regexp"
- "strings"
- "unicode"
-
- "github.com/olekukonko/tablewriter"
- "github.com/ssor/bom"
- "golang.org/x/net/html"
- "golang.org/x/net/html/atom"
-)
-
-// Options provide toggles and overrides to control specific rendering behaviors.
-type Options struct {
- PrettyTables bool // Turns on pretty ASCII rendering for table elements.
- OmitLinks bool // Turns on omitting links
-}
-
-// FromHTMLNode renders text output from a pre-parsed HTML document.
-func FromHTMLNode(doc *html.Node, o ...Options) (string, error) {
- var options Options
- if len(o) > 0 {
- options = o[0]
- }
-
- ctx := textifyTraverseContext{
- buf: bytes.Buffer{},
- options: options,
- }
- if err := ctx.traverse(doc); err != nil {
- return "", err
- }
-
- text := strings.TrimSpace(newlineRe.ReplaceAllString(
- strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"),
- )
- return text, nil
-}
-
-// FromReader renders text output after parsing HTML for the specified
-// io.Reader.
-func FromReader(reader io.Reader, options ...Options) (string, error) {
- newReader, err := bom.NewReaderWithoutBom(reader)
- if err != nil {
- return "", err
- }
- doc, err := html.Parse(newReader)
- if err != nil {
- return "", err
- }
- return FromHTMLNode(doc, options...)
-}
-
-// FromString parses HTML from the input string, then renders the text form.
-func FromString(input string, options ...Options) (string, error) {
- bs := bom.CleanBom([]byte(input))
- text, err := FromReader(bytes.NewReader(bs), options...)
- if err != nil {
- return "", err
- }
- return text, nil
-}
-
-var (
- spacingRe = regexp.MustCompile(`[ \r\n\t]+`)
- newlineRe = regexp.MustCompile(`\n\n+`)
-)
-
-// traverseTableCtx holds text-related context.
-type textifyTraverseContext struct {
- buf bytes.Buffer
-
- prefix string
- tableCtx tableTraverseContext
- options Options
- endsWithSpace bool
- justClosedDiv bool
- blockquoteLevel int
- lineLength int
- isPre bool
-}
-
-// tableTraverseContext holds table ASCII-form related context.
-type tableTraverseContext struct {
- header []string
- body [][]string
- footer []string
- tmpRow int
- isInFooter bool
-}
-
-func (tableCtx *tableTraverseContext) init() {
- tableCtx.body = [][]string{}
- tableCtx.header = []string{}
- tableCtx.footer = []string{}
- tableCtx.isInFooter = false
- tableCtx.tmpRow = 0
-}
-
-func (ctx *textifyTraverseContext) handleElement(node *html.Node) error {
- ctx.justClosedDiv = false
-
- switch node.DataAtom {
- case atom.Br:
- return ctx.emit("\n")
-
- case atom.H1, atom.H2, atom.H3:
- subCtx := textifyTraverseContext{}
- if err := subCtx.traverseChildren(node); err != nil {
- return err
- }
-
- str := subCtx.buf.String()
- dividerLen := 0
- for _, line := range strings.Split(str, "\n") {
- if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
- dividerLen = lineLen - 1
- }
- }
- var divider string
- if node.DataAtom == atom.H1 {
- divider = strings.Repeat("*", dividerLen)
- } else {
- divider = strings.Repeat("-", dividerLen)
- }
-
- if node.DataAtom == atom.H3 {
- return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
- }
- return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
-
- case atom.Blockquote:
- ctx.blockquoteLevel++
- ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
- if err := ctx.emit("\n"); err != nil {
- return err
- }
- if ctx.blockquoteLevel == 1 {
- if err := ctx.emit("\n"); err != nil {
- return err
- }
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- ctx.blockquoteLevel--
- ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
- if ctx.blockquoteLevel > 0 {
- ctx.prefix += " "
- }
- return ctx.emit("\n\n")
-
- case atom.Div:
- if ctx.lineLength > 0 {
- if err := ctx.emit("\n"); err != nil {
- return err
- }
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- var err error
- if !ctx.justClosedDiv {
- err = ctx.emit("\n")
- }
- ctx.justClosedDiv = true
- return err
-
- case atom.Li:
- if err := ctx.emit("* "); err != nil {
- return err
- }
-
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
-
- return ctx.emit("\n")
-
- case atom.B, atom.Strong:
- subCtx := textifyTraverseContext{}
- subCtx.endsWithSpace = true
- if err := subCtx.traverseChildren(node); err != nil {
- return err
- }
- str := subCtx.buf.String()
- return ctx.emit("*" + str + "*")
-
- case atom.A:
- linkText := ""
- // For simple link element content with single text node only, peek at the link text.
- if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode {
- linkText = node.FirstChild.Data
- }
-
- // If image is the only child, take its alt text as the link text.
- if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
- if altText := getAttrVal(img, "alt"); altText != "" {
- if err := ctx.emit(altText); err != nil {
- return err
- }
- }
- } else if err := ctx.traverseChildren(node); err != nil {
- return err
- }
-
- hrefLink := ""
- if attrVal := getAttrVal(node, "href"); attrVal != "" {
- attrVal = ctx.normalizeHrefLink(attrVal)
- // Don't print link href if it matches link element content or if the link is empty.
- if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal {
- hrefLink = "( " + attrVal + " )"
- }
- }
-
- return ctx.emit(hrefLink)
-
- case atom.P, atom.Ul:
- return ctx.paragraphHandler(node)
-
- case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td:
- if ctx.options.PrettyTables {
- return ctx.handleTableElement(node)
- } else if node.DataAtom == atom.Table {
- return ctx.paragraphHandler(node)
- }
- return ctx.traverseChildren(node)
-
- case atom.Pre:
- ctx.isPre = true
- err := ctx.traverseChildren(node)
- ctx.isPre = false
- return err
-
- case atom.Style, atom.Script, atom.Head:
- // Ignore the subtree.
- return nil
-
- default:
- return ctx.traverseChildren(node)
- }
-}
-
-// paragraphHandler renders node children surrounded by double newlines.
-func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error {
- if err := ctx.emit("\n\n"); err != nil {
- return err
- }
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- return ctx.emit("\n\n")
-}
-
-// handleTableElement is only to be invoked when options.PrettyTables is active.
-func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error {
- if !ctx.options.PrettyTables {
- panic("handleTableElement invoked when PrettyTables not active")
- }
-
- switch node.DataAtom {
- case atom.Table:
- if err := ctx.emit("\n\n"); err != nil {
- return err
- }
-
- // Re-intialize all table context.
- ctx.tableCtx.init()
-
- // Browse children, enriching context with table data.
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
-
- buf := &bytes.Buffer{}
- table := tablewriter.NewWriter(buf)
- table.SetHeader(ctx.tableCtx.header)
- table.SetFooter(ctx.tableCtx.footer)
- table.AppendBulk(ctx.tableCtx.body)
-
- // Render the table using ASCII.
- table.Render()
- if err := ctx.emit(buf.String()); err != nil {
- return err
- }
-
- return ctx.emit("\n\n")
-
- case atom.Tfoot:
- ctx.tableCtx.isInFooter = true
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- ctx.tableCtx.isInFooter = false
-
- case atom.Tr:
- ctx.tableCtx.body = append(ctx.tableCtx.body, []string{})
- if err := ctx.traverseChildren(node); err != nil {
- return err
- }
- ctx.tableCtx.tmpRow++
-
- case atom.Th:
- res, err := ctx.renderEachChild(node)
- if err != nil {
- return err
- }
-
- ctx.tableCtx.header = append(ctx.tableCtx.header, res)
-
- case atom.Td:
- res, err := ctx.renderEachChild(node)
- if err != nil {
- return err
- }
-
- if ctx.tableCtx.isInFooter {
- ctx.tableCtx.footer = append(ctx.tableCtx.footer, res)
- } else {
- ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res)
- }
-
- }
- return nil
-}
-
-func (ctx *textifyTraverseContext) traverse(node *html.Node) error {
- switch node.Type {
- default:
- return ctx.traverseChildren(node)
-
- case html.TextNode:
- var data string
- if ctx.isPre {
- data = node.Data
- } else {
- data = strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
- }
- return ctx.emit(data)
-
- case html.ElementNode:
- return ctx.handleElement(node)
- }
-}
-
-func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error {
- for c := node.FirstChild; c != nil; c = c.NextSibling {
- if err := ctx.traverse(c); err != nil {
- return err
- }
- }
-
- return nil
-}
-
-func (ctx *textifyTraverseContext) emit(data string) error {
- if data == "" {
- return nil
- }
- var (
- lines = ctx.breakLongLines(data)
- err error
- )
- for _, line := range lines {
- runes := []rune(line)
- startsWithSpace := unicode.IsSpace(runes[0])
- if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") {
- if err = ctx.buf.WriteByte(' '); err != nil {
- return err
- }
- ctx.lineLength++
- }
- ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
- for _, c := range line {
- if _, err = ctx.buf.WriteString(string(c)); err != nil {
- return err
- }
- ctx.lineLength++
- if c == '\n' {
- ctx.lineLength = 0
- if ctx.prefix != "" {
- if _, err = ctx.buf.WriteString(ctx.prefix); err != nil {
- return err
- }
- }
- }
- }
- }
- return nil
-}
-
-const maxLineLen = 74
-
-func (ctx *textifyTraverseContext) breakLongLines(data string) []string {
- // Only break lines when in blockquotes.
- if ctx.blockquoteLevel == 0 {
- return []string{data}
- }
- var (
- ret = []string{}
- runes = []rune(data)
- l = len(runes)
- existing = ctx.lineLength
- )
- if existing >= maxLineLen {
- ret = append(ret, "\n")
- existing = 0
- }
- for l+existing > maxLineLen {
- i := maxLineLen - existing
- for i >= 0 && !unicode.IsSpace(runes[i]) {
- i--
- }
- if i == -1 {
- // No spaces, so go the other way.
- i = maxLineLen - existing
- for i < l && !unicode.IsSpace(runes[i]) {
- i++
- }
- }
- ret = append(ret, string(runes[:i])+"\n")
- for i < l && unicode.IsSpace(runes[i]) {
- i++
- }
- runes = runes[i:]
- l = len(runes)
- existing = 0
- }
- if len(runes) > 0 {
- ret = append(ret, string(runes))
- }
- return ret
-}
-
-func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string {
- link = strings.TrimSpace(link)
- link = strings.TrimPrefix(link, "mailto:")
- return link
-}
-
-// renderEachChild visits each direct child of a node and collects the sequence of
-// textuual representaitons separated by a single newline.
-func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) {
- buf := &bytes.Buffer{}
- for c := node.FirstChild; c != nil; c = c.NextSibling {
- s, err := FromHTMLNode(c, ctx.options)
- if err != nil {
- return "", err
- }
- if _, err = buf.WriteString(s); err != nil {
- return "", err
- }
- if c.NextSibling != nil {
- if err = buf.WriteByte('\n'); err != nil {
- return "", err
- }
- }
- }
- return buf.String(), nil
-}
-
-func getAttrVal(node *html.Node, attrName string) string {
- for _, attr := range node.Attr {
- if attr.Key == attrName {
- return attr.Val
- }
- }
-
- return ""
-}