diff --git a/.vscode/settings.json b/.vscode/settings.json index b1cb189..e1be600 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,9 @@ { "cSpell.words": [ + "chromedp", "godotenv", "Puerkito", - "splitted" + "splitted", + "syns" ] } \ No newline at end of file diff --git a/go.mod b/go.mod index c114a61..bf40036 100644 --- a/go.mod +++ b/go.mod @@ -18,8 +18,8 @@ require ( github.com/bytedance/sonic v1.8.0 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect - github.com/chromedp/cdproto v0.0.0-20230625224106-7fafe342e117 // indirect - github.com/chromedp/chromedp v0.9.1 // indirect + github.com/chromedp/cdproto v0.0.0-20240127002248-bd7a66284627 // indirect + github.com/chromedp/chromedp v0.9.3 // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/geziyor/geziyor v0.0.0-20230315135110-a242b58aaa65 // indirect github.com/gin-contrib/sse v0.1.0 // indirect @@ -30,7 +30,7 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.2.1 // indirect + github.com/gobwas/ws v1.3.2 // indirect github.com/goccy/go-json v0.10.0 // indirect github.com/gocolly/colly v1.2.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect @@ -59,7 +59,7 @@ require ( golang.org/x/arch v0.0.0-20210923205945-b76863e36670 // indirect golang.org/x/crypto v0.10.0 // indirect golang.org/x/net v0.11.0 // indirect - golang.org/x/sys v0.9.0 // indirect + golang.org/x/sys v0.16.0 // indirect golang.org/x/text v0.10.0 // indirect golang.org/x/time v0.3.0 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index 84fa0df..6ee4d17 100644 --- a/go.sum +++ b/go.sum @@ -99,9 +99,14 @@ github.com/chromedp/cdproto v0.0.0-20220428002153-285dfb42699c/go.mod h1:5Y4sD/e github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= github.com/chromedp/cdproto v0.0.0-20230625224106-7fafe342e117 h1:b++oYK7VpsjAVHJNpbhfNrKyCej4dEKIk+I22vDo4RE= github.com/chromedp/cdproto v0.0.0-20230625224106-7fafe342e117/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20231011050154-1d073bb38998/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20240127002248-bd7a66284627 h1:L5rJ/yzLfSU3kcjsjq11xYDqAdianisL21CXQ/08Zag= +github.com/chromedp/cdproto v0.0.0-20240127002248-bd7a66284627/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= github.com/chromedp/chromedp v0.8.0/go.mod h1:odCVV9o9i7HUKwHMFz9Y7T6s4Kbcz4GOyPlwKWopI9Q= github.com/chromedp/chromedp v0.9.1 h1:CC7cC5p1BeLiiS2gfNNPwp3OaUxtRMBjfiw3E3k6dFA= github.com/chromedp/chromedp v0.9.1/go.mod h1:DUgZWRvYoEfgi66CgZ/9Yv+psgi+Sksy5DTScENWjaQ= +github.com/chromedp/chromedp v0.9.3 h1:Wq58e0dZOdHsxaj9Owmfcf+ibtpYN1N0FWVbaxa/esg= +github.com/chromedp/chromedp v0.9.3/go.mod h1:NipeUkUcuzIdFbBP8eNNvl9upcceOfWzoJn6cRe4ksA= github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= @@ -189,6 +194,9 @@ github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6Wezm github.com/gobwas/ws v1.1.0/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0= github.com/gobwas/ws v1.2.1 h1:F2aeBZrm2NDsc7vbovKrWSogd4wvfAxg0FQ89/iqOTk= github.com/gobwas/ws v1.2.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/gobwas/ws v1.3.0/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/gobwas/ws v1.3.2 h1:zlnbNHxumkRvfPWgfXu8RBwyNR1x8wh9cf5PTOCqs9Q= +github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/goccy/go-json v0.10.0 h1:mXKd9Qw4NuzShiRlOXKews24ufknHO7gx30lsDyokKA= github.com/goccy/go-json v0.10.0/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= @@ -720,6 +728,8 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s= golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= diff --git a/scrapper/thesaurus.go b/scrapper/thesaurus.go index 666a402..9518022 100644 --- a/scrapper/thesaurus.go +++ b/scrapper/thesaurus.go @@ -1,15 +1,13 @@ package scrapper import ( - "errors" + "context" "fmt" "log" - "net/http" "strings" + "time" - "github.com/PuerkitoBio/goquery" - "github.com/geziyor/geziyor" - "github.com/geziyor/geziyor/client" + "github.com/chromedp/chromedp" ) type WordResponse struct { @@ -29,189 +27,177 @@ func GetResult(word string) (WordResponse, error) { var err error // temp PoS and Def - tempPoS := []string{} - tempDef := []string{} - - geziyor.NewGeziyor(&geziyor.Options{ - // StartRequestsFunc: func(g *geziyor.Geziyor) { - // g.GetRendered("https://www.thesaurus.com/browse/"+word, g.Opt.ParseFunc) - // }, - StartURLs: []string{"https://www.thesaurus.com/browse/" + word}, - ParseFunc: func(g *geziyor.Geziyor, r *client.Response) { - - if r.StatusCode != http.StatusOK { - fmt.Println("There was an error, ", r.Status) - err = fmt.Errorf("%s", r.Status) - } - - // fmt.Println(string(r.Body)) - - root := r.HTMLDoc.Find("[data-type='thesaurus-entry-module']") - - fmt.Println("roost") - fmt.Println(root.Length()) - - // find the parts of speech with definitions - tabList := root.Find("[data-type='thesaurus-entry-tablist']") - - fmt.Println(tabList.Length()) - - tabList.Find("li").Each(func(i int, s *goquery.Selection) { - fmt.Println(s.Text()) - whole := s.Text() - pos := s.Find("em").Text() - def := strings.TrimLeft(strings.ReplaceAll(whole, pos, ""), " ") - - tempPoS = append(tempPoS, pos) - tempDef = append(tempDef, def) - - fmt.Println(def) - fmt.Println(pos) - - }) - - singleGroup := []string{} - - card := root.Find("[data-type='thesaurus-synonyms-card']") - - card.Find("li").Each(func(i int, s *goquery.Selection) { - fmt.Println(s.Text()) - sn := strings.TrimSpace(strings.ReplaceAll(s.Text(), "\n", " ")) - if len(sn) > 0 { - singleGroup = append(singleGroup, sn) - } - }) - - singleSynonymObj := Synonym{} - - if len(tempDef) > 0 { - singleSynonymObj.Definition = tempDef[0] - singleSynonymObj.PartsOfSpeech = tempPoS[0] - singleSynonymObj.Syns = singleGroup - finalResult.Synonyms = append(finalResult.Synonyms, singleSynonymObj) - - } - - // now find the antonyms - antonyms := []string{} - aCard := root.Find("[data-type='thesaurus-antonyms-card']") - fmt.Println(aCard.Length()) - aCard.Find("li").Each(func(i int, s *goquery.Selection) { - an := strings.TrimSpace(strings.ReplaceAll(s.Text(), "\n", " ")) - - if len(an) > 0 { - antonyms = append(antonyms, an) - } - }) - finalResult.Antonyms = antonyms - }, - //BrowserEndpoint: "ws://localhost:3000", - }).Start() - - return finalResult, err + // tempPoS := []string{} + // tempDef := []string{} + StartURLs := "https://www.thesaurus.com/browse/" + word + + ctx, cancel := chromedp.NewExecAllocator(context.Background(), append(chromedp.DefaultExecAllocatorOptions[:], chromedp.Flag("headless", false))...) + defer cancel() + ctx, cancel = chromedp.NewContext(ctx) + defer cancel() + + // run task list + + err = chromedp.Run(ctx, + chromedp.Navigate(StartURLs), + // chromedp.WaitVisible("body"), + chromedp.Sleep(500*time.Millisecond), + ) + if err != nil { + fmt.Println(err) + return finalResult, err + } - // Request the HTML page. - res, err := http.Get("https://www.thesaurus.com/browse/" + word) + checkRootXpath := "/html/body/div[1]/div/main/div[2]/div[2]/div[1]/section/div/h1" + + // Execute JavaScript in the browser context to get total number of elements matching the XPath expression + var checkRoot int + err = chromedp.Run(ctx, chromedp.Evaluate(fmt.Sprintf(`(function() { + var elements = document.evaluate('%s', document, null, XPathResult.ANY_TYPE, null); + var length = 0; + while (elements.iterateNext()) { + length++; + } + return length; + })()`, checkRootXpath), &checkRoot)) + if err != nil { - log.Fatal(err) + fmt.Println(err) + return finalResult, err } - fmt.Println("=========body==========") - fmt.Println(res.Status) + if checkRoot == 0 { + fmt.Println("Root not found") + return finalResult, nil + } - defer res.Body.Close() - if res.StatusCode != 200 { + // check total parts of speech + totalPOSXpath := "/html/body/div[1]/div/main/div[2]/div[2]/div[2]/section/div[@data-type=\"synonym-and-antonym-card\"]" - return finalResult, errors.New(res.Status) - // log.Fatalf("status code error: %d %s", res.StatusCode, res.Status) - } + // Execute JavaScript in the browser context to get total number of elements matching the XPath expression + var totalPOSLength int + err = chromedp.Run(ctx, chromedp.Evaluate(fmt.Sprintf(`(function() { + var elements = document.evaluate('%s', document, null, XPathResult.ANY_TYPE, null); + var length = 0; + while (elements.iterateNext()) { + length++; + } + return length; + })()`, totalPOSXpath), &totalPOSLength)) - // Load the HTML document - doc, err := goquery.NewDocumentFromReader(res.Body) if err != nil { - return finalResult, err - // log.Fatal(err) + fmt.Println("No synonym found") + return finalResult, nil } - container := doc.Filter(".wjLcgFJpqs9M6QJsPf5v") + if totalPOSLength == 0 { + fmt.Println("No synonym found") + return finalResult, nil + } - fmt.Println(container.Length()) + fmt.Println("Total POS length:", totalPOSLength) + + for i := 0; i < totalPOSLength; i++ { + var currentPos Synonym + var syns []string + // iterate over all the POS + synonymRoot := fmt.Sprintf("/html/body/div[1]/div/main/div[2]/div[2]/div[2]/section/div[@data-type=\"synonym-and-antonym-card\"][%v]/div[2]/div[2]/div", i+1) + + // Execute JavaScript in the browser context to get total number of elements matching the XPath expression + var totalSynonymLength int + err = chromedp.Run(ctx, chromedp.Evaluate(fmt.Sprintf(`(function() { + var elements = document.evaluate('%s', document, null, XPathResult.ANY_TYPE, null); + var length = 0; + while (elements.iterateNext()) { + length++; + } + return length; + })()`, synonymRoot), &totalSynonymLength)) + + if err != nil { + fmt.Println(err) + continue + } - // container := doc.Find(".MainContentContainer") + if totalSynonymLength == 0 { + fmt.Println("No synonym found") + return finalResult, err + } - // inside MainContentContainer - // first ul parts of speech with definition - // second ul synonyms - // and followed by more synonyms for parts of speech - // inside #antonyms the ul is the antonyms + fmt.Println("Total synonym length:", totalSynonymLength) - // check if definition is available or not - defs := container.Find(".ew5makj1") - // defs := container.Find("ul:first-child") + posXpath := fmt.Sprintf(" /html/body/div[1]/div/main/div[2]/div[2]/div[2]/section/div[@data-type=\"synonym-and-antonym-card\"][%v]/div[1]/p", i+1) + // get the pos and definition + var posString string + _ = chromedp.Run(ctx, + chromedp.Evaluate(fmt.Sprintf(`document.evaluate('%s',document,null,XPathResult.FIRST_ORDERED_NODE_TYPE,null,).singleNodeValue?.textContent`, posXpath), &posString)) - if defs.Length() == 0 { - fmt.Println("No definition available") - return finalResult, nil - } + poss := strings.Split(posString, " as in ") - // not get the parts of speech - defs.Each(func(i int, s *goquery.Selection) { - // find parts of speech - // fmt.Println("parts of speech", s.Find("em").Text()) - tempPoS = append(tempPoS, s.Find("em").Text()) - // fmt.Println("meaning", s.Find("strong").Text()) - tempDef = append(tempDef, s.Find("strong").Text()) - }) - - // now find the synonyms and antonyms - - // len := container.Find("ul.e1ccqdb60").Length() - // synonyms := container.Find("ul.e1ccqdb60").First().Find("li").Each(func(i int, s *goquery.Selection) { - // fmt.Println(s.Find("a").Text()) - // }) - - // synonyms := [][]string{} - singleSynonymObj := Synonym{} - - // check if second synonym is available - for i := 0; i < defs.Length(); i++ { - singleGroup := []string{} - container.Find("ul").Eq(i + 1).Find("li").Each(func(i int, s *goquery.Selection) { - // fmt.Println(s.Find("a").Text()) - sn := strings.TrimSpace(strings.ReplaceAll(s.Find("a").Text(), "\n", " ")) - if len(sn) > 0 { - singleGroup = append(singleGroup, sn) - } - - }) - singleSynonymObj.Definition = tempDef[i] - singleSynonymObj.PartsOfSpeech = tempPoS[i] - singleSynonymObj.Syns = singleGroup + if len(poss) == 2 { + currentPos.Definition = strings.TrimSpace(poss[1]) + currentPos.PartsOfSpeech = poss[0] + } - finalResult.Synonyms = append(finalResult.Synonyms, singleSynonymObj) + fmt.Println(posString) + + for j := 0; j < totalSynonymLength; j++ { + // Define your XPath expression + xpathExpression := fmt.Sprintf("/html/body/div[1]/div/main/div[2]/div[2]/div[2]/section/div[@data-type=\"synonym-and-antonym-card\"][%v]/div[2]/div[2]/div[%v]/ul/li", i+1, j+1) + + // Check if the XPath expression is valid + var isValid bool + err = chromedp.Run(ctx, chromedp.Evaluate(fmt.Sprintf(`(function() { + try { + document.evaluate('%s', document, null, XPathResult.ANY_TYPE, null); + return true; + } catch (e) { + return false; + } + })()`, xpathExpression), &isValid)) + + if err != nil { + fmt.Println(err) + continue + } - // synonyms = append(synonyms, singleGroup) - } + if !isValid { + log.Printf("Invalid XPath expression") + continue + } - // fmt.Println(synonyms) + var nodes []interface{} + err = chromedp.Run(ctx, chromedp.Evaluate(fmt.Sprintf(`(function() { + var nodes = []; + var elements = document.evaluate('%s', document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + for (var i = 0; i < elements.snapshotLength; i++) { + nodes.push(elements.snapshotItem(i).textContent.trim()); + } + return nodes; + })()`, xpathExpression), &nodes)) + + if err != nil { + fmt.Println(err) + continue + } - antonyms := []string{} + // Convert interface{} slice to []string + for _, node := range nodes { + syns = append(syns, node.(string)) + } - // find antonyms - container.Find("#antonyms ul").Find("li").Each(func(i int, s *goquery.Selection) { - // fmt.Println(s.Find("a").Text()) - // check string - an := strings.TrimSpace(strings.ReplaceAll(s.Find("a").Text(), "\n", " ")) + fmt.Println(syns) - if len(an) > 0 { - antonyms = append(antonyms, an) } + currentPos.Syns = append(currentPos.Syns, syns...) + finalResult.Synonyms = append(finalResult.Synonyms, currentPos) - }) + } - finalResult.Antonyms = antonyms - // fmt.Println(antonyms) + if len(finalResult.Synonyms) > 0 { + finalResult.Antonyms = append(finalResult.Antonyms, "") + } - return finalResult, nil + return finalResult, err }