Skip to content

xpathvschopper

MIT 13 2 646
58.1 thousand (month) Jun 08 2019 (a day ago)
22 3 1 MIT
0.6.0(11 months ago) Jul 24 2014 820 (month)

xpath is a library for Go that allows you to use XPath expressions to select elements from an HTML document. It is built on top of the html package in the Go standard library, and provides a way to select elements from an HTML document using XPath expressions, which are more powerful and expressive than CSS selectors.

Chopper is a tool to extract elements from HTML by preserving ancestors and CSS rules.

Compared to other HTML parsers Chopper is designed to retain original HTML tree but eliminate elements that do not match parsing rules. Meaning, we can parse HTML elements and keep thei structure for machine learning or other tasks where data structure is needed as well as the data value.

Example Use


package main

import (
  "fmt"
  "github.com/antchfx/xpath"
  "golang.org/x/net/html"
  "strings"
)

func main() {
  // Create an HTML string
  html := `<html>
        <body>
          <div id="content">
            <p>Hello, World!</p>
            <a href="http://example.com">Example</a>
          </div>
        </body>
      </html>`

  // Parse the HTML string into a node tree
  doc, err := html.Parse(strings.NewReader(html))
  if err != nil {
    fmt.Println("Error:", err)
    return
  }

  // Compile the XPath expression
  expr, err := xpath.Compile("//p")
  if err != nil {
    fmt.Println("Error:", err)
    return
  }

  // Use the Evaluate method to select elements from the document
  nodes, err := expr.Evaluate(xpath.NodeNavigator(doc))
  if err != nil {
    fmt.Println("Error:", err)
    return
  }
  if nodes.MoveNext() {
    fmt.Println(nodes.Current().Value())
    // > Hello, World!
  }
}
HTML = """
<html>
  <head>
    <title>Test</title>
  </head>
  <body>
    <div id="header"></div>
    <div id="main">
      <div class="iwantthis">
        HELLO WORLD
        <a href="/nope">Do not want</a>
      </div>
    </div>
    <div id="footer"></div>
  </body>
</html>
"""

CSS = """
div { border: 1px solid black; }
div#main { color: blue; }
div.iwantthis { background-color: red; }
a { color: green; }
div#footer { border-top: 2px solid red; }
"""

extractor = Extractor.keep('//div[@class="iwantthis"]').discard('//a')
html, css = extractor.extract(HTML, CSS)

# will result in:
html
"""
<html>
  <body>
    <div id="main">
      <div class="iwantthis">
        HELLO WORLD
      </div>
    </div>
  </body>
</html>"""

css
"""
div{border:1px solid black;}
div#main{color:blue;}
div.iwantthis{background-color:red;}
"""

Alternatives / Similar