diff options
Diffstat (limited to 'vendor/github.com/jaytaylor/html2text/html2text.go')
| -rw-r--r-- | vendor/github.com/jaytaylor/html2text/html2text.go | 473 |
1 files changed, 0 insertions, 473 deletions
diff --git a/vendor/github.com/jaytaylor/html2text/html2text.go b/vendor/github.com/jaytaylor/html2text/html2text.go deleted file mode 100644 index fa36990..0000000 --- a/vendor/github.com/jaytaylor/html2text/html2text.go +++ /dev/null @@ -1,473 +0,0 @@ -package html2text - -import ( - "bytes" - "io" - "regexp" - "strings" - "unicode" - - "github.com/olekukonko/tablewriter" - "github.com/ssor/bom" - "golang.org/x/net/html" - "golang.org/x/net/html/atom" -) - -// Options provide toggles and overrides to control specific rendering behaviors. -type Options struct { - PrettyTables bool // Turns on pretty ASCII rendering for table elements. - OmitLinks bool // Turns on omitting links -} - -// FromHTMLNode renders text output from a pre-parsed HTML document. -func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { - var options Options - if len(o) > 0 { - options = o[0] - } - - ctx := textifyTraverseContext{ - buf: bytes.Buffer{}, - options: options, - } - if err := ctx.traverse(doc); err != nil { - return "", err - } - - text := strings.TrimSpace(newlineRe.ReplaceAllString( - strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), - ) - return text, nil -} - -// FromReader renders text output after parsing HTML for the specified -// io.Reader. -func FromReader(reader io.Reader, options ...Options) (string, error) { - newReader, err := bom.NewReaderWithoutBom(reader) - if err != nil { - return "", err - } - doc, err := html.Parse(newReader) - if err != nil { - return "", err - } - return FromHTMLNode(doc, options...) -} - -// FromString parses HTML from the input string, then renders the text form. -func FromString(input string, options ...Options) (string, error) { - bs := bom.CleanBom([]byte(input)) - text, err := FromReader(bytes.NewReader(bs), options...) - if err != nil { - return "", err - } - return text, nil -} - -var ( - spacingRe = regexp.MustCompile(`[ \r\n\t]+`) - newlineRe = regexp.MustCompile(`\n\n+`) -) - -// traverseTableCtx holds text-related context. -type textifyTraverseContext struct { - buf bytes.Buffer - - prefix string - tableCtx tableTraverseContext - options Options - endsWithSpace bool - justClosedDiv bool - blockquoteLevel int - lineLength int - isPre bool -} - -// tableTraverseContext holds table ASCII-form related context. -type tableTraverseContext struct { - header []string - body [][]string - footer []string - tmpRow int - isInFooter bool -} - -func (tableCtx *tableTraverseContext) init() { - tableCtx.body = [][]string{} - tableCtx.header = []string{} - tableCtx.footer = []string{} - tableCtx.isInFooter = false - tableCtx.tmpRow = 0 -} - -func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { - ctx.justClosedDiv = false - - switch node.DataAtom { - case atom.Br: - return ctx.emit("\n") - - case atom.H1, atom.H2, atom.H3: - subCtx := textifyTraverseContext{} - if err := subCtx.traverseChildren(node); err != nil { - return err - } - - str := subCtx.buf.String() - dividerLen := 0 - for _, line := range strings.Split(str, "\n") { - if lineLen := len([]rune(line)); lineLen-1 > dividerLen { - dividerLen = lineLen - 1 - } - } - var divider string - if node.DataAtom == atom.H1 { - divider = strings.Repeat("*", dividerLen) - } else { - divider = strings.Repeat("-", dividerLen) - } - - if node.DataAtom == atom.H3 { - return ctx.emit("\n\n" + str + "\n" + divider + "\n\n") - } - return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n") - - case atom.Blockquote: - ctx.blockquoteLevel++ - ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " " - if err := ctx.emit("\n"); err != nil { - return err - } - if ctx.blockquoteLevel == 1 { - if err := ctx.emit("\n"); err != nil { - return err - } - } - if err := ctx.traverseChildren(node); err != nil { - return err - } - ctx.blockquoteLevel-- - ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) - if ctx.blockquoteLevel > 0 { - ctx.prefix += " " - } - return ctx.emit("\n\n") - - case atom.Div: - if ctx.lineLength > 0 { - if err := ctx.emit("\n"); err != nil { - return err - } - } - if err := ctx.traverseChildren(node); err != nil { - return err - } - var err error - if !ctx.justClosedDiv { - err = ctx.emit("\n") - } - ctx.justClosedDiv = true - return err - - case atom.Li: - if err := ctx.emit("* "); err != nil { - return err - } - - if err := ctx.traverseChildren(node); err != nil { - return err - } - - return ctx.emit("\n") - - case atom.B, atom.Strong: - subCtx := textifyTraverseContext{} - subCtx.endsWithSpace = true - if err := subCtx.traverseChildren(node); err != nil { - return err - } - str := subCtx.buf.String() - return ctx.emit("*" + str + "*") - - case atom.A: - linkText := "" - // For simple link element content with single text node only, peek at the link text. - if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode { - linkText = node.FirstChild.Data - } - - // If image is the only child, take its alt text as the link text. - if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img { - if altText := getAttrVal(img, "alt"); altText != "" { - if err := ctx.emit(altText); err != nil { - return err - } - } - } else if err := ctx.traverseChildren(node); err != nil { - return err - } - - hrefLink := "" - if attrVal := getAttrVal(node, "href"); attrVal != "" { - attrVal = ctx.normalizeHrefLink(attrVal) - // Don't print link href if it matches link element content or if the link is empty. - if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal { - hrefLink = "( " + attrVal + " )" - } - } - - return ctx.emit(hrefLink) - - case atom.P, atom.Ul: - return ctx.paragraphHandler(node) - - case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td: - if ctx.options.PrettyTables { - return ctx.handleTableElement(node) - } else if node.DataAtom == atom.Table { - return ctx.paragraphHandler(node) - } - return ctx.traverseChildren(node) - - case atom.Pre: - ctx.isPre = true - err := ctx.traverseChildren(node) - ctx.isPre = false - return err - - case atom.Style, atom.Script, atom.Head: - // Ignore the subtree. - return nil - - default: - return ctx.traverseChildren(node) - } -} - -// paragraphHandler renders node children surrounded by double newlines. -func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error { - if err := ctx.emit("\n\n"); err != nil { - return err - } - if err := ctx.traverseChildren(node); err != nil { - return err - } - return ctx.emit("\n\n") -} - -// handleTableElement is only to be invoked when options.PrettyTables is active. -func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error { - if !ctx.options.PrettyTables { - panic("handleTableElement invoked when PrettyTables not active") - } - - switch node.DataAtom { - case atom.Table: - if err := ctx.emit("\n\n"); err != nil { - return err - } - - // Re-intialize all table context. - ctx.tableCtx.init() - - // Browse children, enriching context with table data. - if err := ctx.traverseChildren(node); err != nil { - return err - } - - buf := &bytes.Buffer{} - table := tablewriter.NewWriter(buf) - table.SetHeader(ctx.tableCtx.header) - table.SetFooter(ctx.tableCtx.footer) - table.AppendBulk(ctx.tableCtx.body) - - // Render the table using ASCII. - table.Render() - if err := ctx.emit(buf.String()); err != nil { - return err - } - - return ctx.emit("\n\n") - - case atom.Tfoot: - ctx.tableCtx.isInFooter = true - if err := ctx.traverseChildren(node); err != nil { - return err - } - ctx.tableCtx.isInFooter = false - - case atom.Tr: - ctx.tableCtx.body = append(ctx.tableCtx.body, []string{}) - if err := ctx.traverseChildren(node); err != nil { - return err - } - ctx.tableCtx.tmpRow++ - - case atom.Th: - res, err := ctx.renderEachChild(node) - if err != nil { - return err - } - - ctx.tableCtx.header = append(ctx.tableCtx.header, res) - - case atom.Td: - res, err := ctx.renderEachChild(node) - if err != nil { - return err - } - - if ctx.tableCtx.isInFooter { - ctx.tableCtx.footer = append(ctx.tableCtx.footer, res) - } else { - ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res) - } - - } - return nil -} - -func (ctx *textifyTraverseContext) traverse(node *html.Node) error { - switch node.Type { - default: - return ctx.traverseChildren(node) - - case html.TextNode: - var data string - if ctx.isPre { - data = node.Data - } else { - data = strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ") - } - return ctx.emit(data) - - case html.ElementNode: - return ctx.handleElement(node) - } -} - -func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error { - for c := node.FirstChild; c != nil; c = c.NextSibling { - if err := ctx.traverse(c); err != nil { - return err - } - } - - return nil -} - -func (ctx *textifyTraverseContext) emit(data string) error { - if data == "" { - return nil - } - var ( - lines = ctx.breakLongLines(data) - err error - ) - for _, line := range lines { - runes := []rune(line) - startsWithSpace := unicode.IsSpace(runes[0]) - if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { - if err = ctx.buf.WriteByte(' '); err != nil { - return err - } - ctx.lineLength++ - } - ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) - for _, c := range line { - if _, err = ctx.buf.WriteString(string(c)); err != nil { - return err - } - ctx.lineLength++ - if c == '\n' { - ctx.lineLength = 0 - if ctx.prefix != "" { - if _, err = ctx.buf.WriteString(ctx.prefix); err != nil { - return err - } - } - } - } - } - return nil -} - -const maxLineLen = 74 - -func (ctx *textifyTraverseContext) breakLongLines(data string) []string { - // Only break lines when in blockquotes. - if ctx.blockquoteLevel == 0 { - return []string{data} - } - var ( - ret = []string{} - runes = []rune(data) - l = len(runes) - existing = ctx.lineLength - ) - if existing >= maxLineLen { - ret = append(ret, "\n") - existing = 0 - } - for l+existing > maxLineLen { - i := maxLineLen - existing - for i >= 0 && !unicode.IsSpace(runes[i]) { - i-- - } - if i == -1 { - // No spaces, so go the other way. - i = maxLineLen - existing - for i < l && !unicode.IsSpace(runes[i]) { - i++ - } - } - ret = append(ret, string(runes[:i])+"\n") - for i < l && unicode.IsSpace(runes[i]) { - i++ - } - runes = runes[i:] - l = len(runes) - existing = 0 - } - if len(runes) > 0 { - ret = append(ret, string(runes)) - } - return ret -} - -func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string { - link = strings.TrimSpace(link) - link = strings.TrimPrefix(link, "mailto:") - return link -} - -// renderEachChild visits each direct child of a node and collects the sequence of -// textuual representaitons separated by a single newline. -func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) { - buf := &bytes.Buffer{} - for c := node.FirstChild; c != nil; c = c.NextSibling { - s, err := FromHTMLNode(c, ctx.options) - if err != nil { - return "", err - } - if _, err = buf.WriteString(s); err != nil { - return "", err - } - if c.NextSibling != nil { - if err = buf.WriteByte('\n'); err != nil { - return "", err - } - } - } - return buf.String(), nil -} - -func getAttrVal(node *html.Node, attrName string) string { - for _, attr := range node.Attr { - if attr.Key == attrName { - return attr.Val - } - } - - return "" -} |
