From 703122b00b1b5440cae28b633bb6ca6bd7e921fd Mon Sep 17 00:00:00 2001 From: Young Xu Date: Sun, 22 Sep 2024 18:51:12 +0800 Subject: [PATCH] first commit Signed-off-by: Young Xu --- .gitignore | 1 + go.mod | 15 +++ go.sum | 48 +++++++++ goh.go | 158 +++++++++++++++++++++++++++ goh_test.go | 301 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 523 insertions(+) create mode 100644 .gitignore create mode 100644 go.mod create mode 100644 go.sum create mode 100644 goh.go create mode 100644 goh_test.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..485dee6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..875bb56 --- /dev/null +++ b/go.mod @@ -0,0 +1,15 @@ +module gitter.top/common/goh + +go 1.20 + +require ( + github.com/andybalholm/cascadia v1.3.2 + github.com/stretchr/testify v1.9.0 + golang.org/x/net v0.29.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..995dde9 --- /dev/null +++ b/go.sum @@ -0,0 +1,48 @@ +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yuin/goldmark v1.4.13/go.mod 
h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/goh.go b/goh.go new file mode 100644 index 0000000..0785509 --- /dev/null +++ b/goh.go @@ -0,0 +1,158 @@ +package goh + +import ( + "errors" + "io" + + "github.com/andybalholm/cascadia" + "golang.org/x/net/html" 
+)
+
+// Errors reported by the query methods. They are package-level values so
+// callers can match them with errors.Is.
+var (
+	// ErrNoNodes is returned when a query is issued on a parser that holds
+	// no nodes to search.
+	ErrNoNodes = errors.New("no nodes found")
+	// ErrAttributeNotFound is returned by Attribute when the selected
+	// element does not carry the requested attribute.
+	ErrAttributeNotFound = errors.New("attribute not found")
+)
+
+// Parser is the query surface shared by every stage of a selector chain.
+type Parser interface {
+	// Find selects the first element that matches the selector.
+	Find(filter string) SingleParser
+	// FindAll selects all the elements that match the selector.
+	FindAll(filter string) MultiParser
+}
+
+// MultiParser is a Parser positioned on a list of elements.
+type MultiParser interface {
+	Parser
+	// Attributes looks up, beneath each selected element, the first
+	// descendant matching filter and collects its attr value. Elements
+	// without such a descendant or without the attribute are skipped.
+	Attributes(filter, attr string) ([]string, error)
+	// Values looks up, beneath each selected element, the first descendant
+	// matching filter and collects its text. Elements without such a
+	// descendant or without text are skipped.
+	Values(filter string) ([]string, error)
+}
+
+// SingleParser is a Parser positioned on at most one element.
+type SingleParser interface {
+	Parser
+	// Attribute returns the value of attr on the selected element.
+	Attribute(attr string) (string, error)
+	// Value returns the text of the selected element.
+	Value() (string, error)
+}
+
+// parser implements Parser, SingleParser and MultiParser. A value is never
+// modified after it has been created: every query returns a fresh parser, so
+// a failed query cannot affect later queries on the same receiver.
+type parser struct {
+	doc  *html.Node   // the selected node; nil for multi-node results and misses
+	docs []*html.Node // the selected nodes of a FindAll result
+	err  error        // first error of the chain, reported by the terminal call
+}
+
+// Compile-time checks that parser satisfies both result interfaces.
+var (
+	_ SingleParser = (*parser)(nil)
+	_ MultiParser  = (*parser)(nil)
+)
+
+// NewParser parses the HTML document read from reader and returns a Parser
+// positioned on the document root. It returns an error when reader is nil or
+// when html.Parse fails.
+func NewParser(reader io.Reader) (Parser, error) {
+	if reader == nil {
+		return nil, errors.New("nil reader")
+	}
+	doc, err := html.Parse(reader)
+	if err != nil {
+		return nil, err
+	}
+	return &parser{doc: doc}, nil
+}
+
+// roots returns the nodes a query issued on p has to search beneath: the
+// single node when p holds one, otherwise the node list.
+func (p *parser) roots() []*html.Node {
+	if p.doc != nil {
+		return []*html.Node{p.doc}
+	}
+	return p.docs
+}
+
+// attrValue returns the value of the first attribute of n named key and
+// whether such an attribute exists.
+func attrValue(n *html.Node, key string) (string, bool) {
+	for _, a := range n.Attr {
+		if a.Key == key {
+			return a.Val, true
+		}
+	}
+	return "", false
+}
+
+// firstText returns the data of the first text node that is a direct child
+// of n and whether such a child exists.
+func firstText(n *html.Node) (string, bool) {
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		if c.Type == html.TextNode {
+			return c.Data, true
+		}
+	}
+	return "", false
+}
+
+// Find returns a parser positioned on the first descendant that matches
+// filter. When the receiver holds several nodes they are searched in order
+// and the first match wins. A miss yields a parser without a node, whose
+// Value and Attribute return "" and a nil error.
+//
+// Errors are carried by the returned parser and surface from the terminal
+// call of the chain: the selector parse error for an invalid filter, or
+// ErrNoNodes when the receiver holds nothing to search.
+func (p *parser) Find(filter string) SingleParser {
+	if p.err != nil {
+		return p
+	}
+	sel, err := cascadia.Parse(filter)
+	if err != nil {
+		// Report the error on a new value so the receiver stays usable.
+		return &parser{err: err}
+	}
+	roots := p.roots()
+	if len(roots) == 0 {
+		return &parser{err: ErrNoNodes}
+	}
+	for _, root := range roots {
+		if node := cascadia.Query(root, sel); node != nil {
+			return &parser{doc: node}
+		}
+	}
+	return &parser{}
+}
+
+// FindAll returns a parser positioned on every descendant that matches
+// filter. When the receiver holds several nodes the matches beneath each of
+// them are concatenated in order; a node reachable from more than one of
+// them is reported once.
+//
+// Errors are carried by the returned parser exactly as described for Find.
+func (p *parser) FindAll(filter string) MultiParser {
+	if p.err != nil {
+		return p
+	}
+	sel, err := cascadia.Parse(filter)
+	if err != nil {
+		return &parser{err: err}
+	}
+	roots := p.roots()
+	if len(roots) == 0 {
+		// cascadia.QueryAll dereferences its node argument, so an empty
+		// receiver has to be rejected before querying.
+		return &parser{err: ErrNoNodes}
+	}
+	var matches []*html.Node
+	seen := make(map[*html.Node]struct{})
+	for _, root := range roots {
+		for _, node := range cascadia.QueryAll(root, sel) {
+			if _, dup := seen[node]; dup {
+				continue
+			}
+			seen[node] = struct{}{}
+			matches = append(matches, node)
+		}
+	}
+	return &parser{docs: matches}
+}
+
+// Attribute returns the value of attr on the selected element. It returns
+// the error carried by the chain if there is one, "" and a nil error when no
+// element is selected, and ErrAttributeNotFound when the element lacks attr.
+func (p *parser) Attribute(attr string) (string, error) {
+	if p.err != nil {
+		return "", p.err
+	}
+	if p.doc == nil {
+		return "", nil
+	}
+	if val, ok := attrValue(p.doc, attr); ok {
+		return val, nil
+	}
+	return "", ErrAttributeNotFound
+}
+
+// Attributes collects the attr value of the first descendant matching filter
+// beneath each selected element. It returns the error carried by the chain
+// if there is one, ErrNoNodes when no elements are selected, and the
+// selector parse error for an invalid filter.
+func (p *parser) Attributes(filter, attr string) ([]string, error) {
+	if p.err != nil {
+		return nil, p.err
+	}
+	if len(p.docs) == 0 {
+		return nil, ErrNoNodes
+	}
+	sel, err := cascadia.Parse(filter)
+	if err != nil {
+		return nil, err
+	}
+	var attributes []string
+	for _, doc := range p.docs {
+		node := cascadia.Query(doc, sel)
+		if node == nil {
+			continue
+		}
+		if val, ok := attrValue(node, attr); ok {
+			attributes = append(attributes, val)
+		}
+	}
+	return attributes, nil
+}
+
+// Value returns the text of the selected element, taken from its first
+// direct text child. It returns the error carried by the chain if there is
+// one, and "" with a nil error when no element is selected or the element
+// has no text child.
+func (p *parser) Value() (string, error) {
+	if p.err != nil {
+		return "", p.err
+	}
+	if p.doc == nil {
+		return "", nil
+	}
+	// A missing text child is reported as the empty string, not an error.
+	text, _ := firstText(p.doc)
+	return text, nil
+}
+
+// Values collects the text of the first descendant matching filter beneath
+// each selected element. It returns the error carried by the chain if there
+// is one, ErrNoNodes when no elements are selected, and the selector parse
+// error for an invalid filter.
+func (p *parser) Values(filter string) ([]string, error) {
+	if p.err != nil {
+		return nil, p.err
+	}
+	if len(p.docs) == 0 {
+		return nil, ErrNoNodes
+	}
+	sel, err := cascadia.Parse(filter)
+	if err != nil {
+		return nil, err
+	}
+	var values []string
+	for _, doc := range p.docs {
+		node := cascadia.Query(doc, sel)
+		if node == nil {
+			continue
+		}
+		if text, ok := firstText(node); ok {
+			values = append(values, text)
+		}
+	}
+	return values, nil
+}
diff --git a/goh_test.go b/goh_test.go
new file mode 100644
index 0000000..7e5691b
--- /dev/null
+++ b/goh_test.go
@@ -0,0 +1,301 @@
+package goh
+
+import (
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func testData() io.Reader {
+	return strings.NewReader(`
+
+
+
+ The Linux Kernel Archives
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+ +
+
+ This site is operated by the Linux Kernel Organization, Inc., a 501(c)3 nonprofit corporation, with support from the following sponsors. +
+ +
+ Equinix Metal +   + Fastly +   + Constellix +
+ Red Hat +   + PureStorage +   + Google +
 
+ The Linux Foundation +
+
+
+
+`)
+}
+
+// TestNewParser checks that a parser built from the fixture can resolve a
+// nested selector down to the text of a link.
+func TestNewParser(t *testing.T) {
+	const selector = "footer#contentinfo.body address#about.vcard.body a"
+
+	p, err := NewParser(testData())
+	assert.NoError(t, err)
+
+	got, err := p.Find(selector).Value()
+	assert.NoError(t, err)
+	assert.Equal(t, "501(c)3 nonprofit corporation", got)
+}
+
+// Test_parser_Find checks both terminal calls of a single-element query: the
+// text of a heading and an attribute of a container.
+func Test_parser_Find(t *testing.T) {
+	p, err := NewParser(testData())
+	assert.NoError(t, err)
+
+	heading, err := p.Find("section#extras div.blogroll h2").Value()
+	assert.NoError(t, err)
+	assert.Equal(t, "Other resources", heading)
+
+	class, err := p.Find("section#extras div.social").Attribute("class")
+	assert.NoError(t, err)
+	assert.Equal(t, "social", class)
+}
+
+// Test_parser_FindAll checks both terminal calls of a multi-element query:
+// the link texts of the blogroll and the alt texts of the sponsor images.
+func Test_parser_FindAll(t *testing.T) {
+	p, err := NewParser(testData())
+	assert.NoError(t, err)
+
+	wantValues := []string{"Git Trees", "Documentation", "Kernel Mailing Lists", "Patchwork", "Wikis", "Bugzilla", "Mirrors", "Linux.com", "Linux Foundation"}
+	gotValues, err := p.FindAll("section#extras div.blogroll ul li").Values("li a")
+	assert.NoError(t, err)
+	assert.Equal(t, wantValues, gotValues)
+
+	wantAttrs := []string{"Equinix Metal", "Fastly", "Constellix", "Red Hat", "PureStorage", "Google", "The Linux Foundation"}
+	gotAttrs, err := p.FindAll("footer#contentinfo.body address#donors a").Attributes("a img", "alt")
+	assert.NoError(t, err)
+	assert.Equal(t, wantAttrs, gotAttrs)
+}
+
+// Test_parser_Find_FindAll_Find checks that single- and multi-element queries
+// can be chained and that the final Find resolves to the first sponsor image.
+func Test_parser_Find_FindAll_Find(t *testing.T) {
+	p, err := NewParser(testData())
+	assert.NoError(t, err)
+
+	footer := p.Find("footer#contentinfo.body")
+	links := footer.FindAll("address#donors a")
+	alt, err := links.Find("a img").Attribute("alt")
+	assert.NoError(t, err)
+	assert.Equal(t, "Equinix Metal", alt)
+}