From 6fc56044be432fd258d04f9c9efeff9e186e1c71 Mon Sep 17 00:00:00 2001 From: mBaratta96 Date: Sun, 13 Aug 2023 11:42:24 +0200 Subject: [PATCH 1/4] get exp albums --- app.go | 6 ++++-- scraper/rym.go | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/app.go b/app.go index 513f777..05cc05f 100644 --- a/app.go +++ b/app.go @@ -37,7 +37,7 @@ func app(s scraper.Scraper) { s.SetLink(data.Links[index]) data = scraper.ScrapeData(s.AlbumList) for true { - cli.CallClear() + // cli.CallClear() cli.PrintMap(s.StyleColor(), data.Metadata) index = checkIndex(cli.PrintTable(data.Rows, data.Columns.Title, data.Columns.Width)) s.SetLink(data.Links[index]) @@ -108,7 +108,7 @@ func app(s scraper.Scraper) { s.SetLink(similData.Links[similIndex]) data = scraper.ScrapeData(s.AlbumList) goingBack = true - } else { // get back to current artist and do nothing + } else { // similIndex is the "Go back" option. Get back to current artist and do nothing s.SetLink(data.Links[index]) } } @@ -122,6 +122,7 @@ func app(s scraper.Scraper) { func main() { website := flag.String("website", "", "Desired Website ('metallum' or 'rym')") rymCredits := flag.Bool("credits", false, "Display RYM credits") + expand := flag.Bool("expand", false, "Expand RYM albums") flag.Parse() if len(flag.Args()) == 0 { os.Exit(1) @@ -146,6 +147,7 @@ func main() { r := &scraper.RateYourMusic{} r.Link = search r.GetCredits = *rymCredits + r.Expand = *expand config, _ := scraper.ReadUserConfiguration(configFilePath) r.Delay = config.Delay if config.Authenticate { diff --git a/scraper/rym.go b/scraper/rym.go index a4155df..0aa064f 100644 --- a/scraper/rym.go +++ b/scraper/rym.go @@ -40,6 +40,7 @@ type RateYourMusic struct { Link string Cookies map[string]string GetCredits bool + Expand bool } type albumTable struct { @@ -161,6 +162,54 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { } c := createCrawler(r.Delay, r.Cookies) 
getAlbumListDiscography(c, data, query) + expandForm := map[string][]byte{ + "sort": []byte("release_date.a,title.a"), + "show_appearances": []byte("false"), + "type": []byte("s"), + "action": []byte("ExpandDiscographySection"), + "rym_ajax_req": []byte("1"), + } + if r.Expand { + if token, err := url.PathUnescape(r.Cookies["ulv"]); err == nil { + expandForm["request_token"] = []byte(token) + } + c.OnHTML("div.section_artist_name input.rym_shortcut", func(h *colly.HTMLElement) { + artistId := h.Attr("value") + expandForm["artist_id"] = []byte(artistId[7 : len(artistId)-1]) + h.Request.PostMultipart("https://rateyourmusic.com/httprequest/ExpandDiscographySection", expandForm) + }) + c.OnResponse(func(r *colly.Response) { + if r.Headers.Get("content-type") == "application/javascript; charset=utf-8" { + body := string(r.Body) + newHTML := body[strings.Index(body, " a", "href")) + }) + } + }) + c.OnError(func(r *colly.Response, err error) { + fmt.Println(err) + }) + } c.Visit(visitLink) c.Wait() return rAlbumlistColWidths[:], rAlbumlistColTitles[:] From 90618998e2814b939085c35c36dc8e352ecab658 Mon Sep 17 00:00:00 2001 From: mBaratta96 Date: Sun, 13 Aug 2023 12:08:08 +0200 Subject: [PATCH 2/4] expanded --- scraper/rym.go | 62 ++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/scraper/rym.go b/scraper/rym.go index 0aa064f..4f09879 100644 --- a/scraper/rym.go +++ b/scraper/rym.go @@ -79,6 +79,24 @@ func createCookieHeader(cookies map[string]string) string { return strings.Join(cookieString, "; ") } +func extractAlbumData(h *colly.HTMLElement, query string, section string, rows *[][]string, links *[]string) { + h.ForEach(query, func(_ int, h *colly.HTMLElement) { + rating := h.ChildText("div.disco_expandcat span.disco_cat_inner") + title := h.ChildText("div.disco_info a.album") + year := h.ChildText("div.disco_info span[class*='disco_year']") + reviews := h.ChildText("div.disco_reviews") + ratings := 
h.ChildText("div.disco_ratings") + average := h.ChildText("div.disco_avg_rating") + recommended := "" + if h.ChildAttr("div.disco_info b.disco_mainline_recommended", "title") == "Recommended" { + recommended = "" + } + row := []string{recommended, title, year, reviews, ratings, average, section, rating} + *rows = append(*rows, row) + *links = append(*links, DOMAIN+h.ChildAttr("div.disco_info > a", "href")) + }) +} + func getAlbumListDiscography(c *colly.Collector, data *ScrapedData, query albumQuery) { c.OnHTML("div#column_container_right div.section_artist_image > a > div", func(h *colly.HTMLElement) { data.Metadata.Set("Top Album", h.Text) @@ -90,24 +108,9 @@ func getAlbumListDiscography(c *colly.Collector, data *ScrapedData, query albumQ data.Metadata.Set("Biography", h.Text) }) } - c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) { for _, albumTable := range query.albumTables { - h.ForEach(albumTable.query, func(_ int, h *colly.HTMLElement) { - rating := h.ChildText("div.disco_expandcat span.disco_cat_inner") - title := h.ChildText("div.disco_info a.album") - year := h.ChildText("div.disco_info span[class*='disco_year']") - reviews := h.ChildText("div.disco_reviews") - ratings := h.ChildText("div.disco_ratings") - average := h.ChildText("div.disco_avg_rating") - recommended := "" - if h.ChildAttr("div.disco_info b.disco_mainline_recommended", "title") == "Recommended" { - recommended = "" - } - row := []string{recommended, title, year, reviews, ratings, average, albumTable.section, rating} - data.Rows = append(data.Rows, row) - data.Links = append(data.Links, DOMAIN+h.ChildAttr("div.disco_info > a", "href")) - }) + extractAlbumData(h, albumTable.query, albumTable.section, &data.Rows, &data.Links) } }) } @@ -149,15 +152,20 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { } else { query = albumQuery{ albumTables: []albumTable{ - {query: "div#disco_type_s > div.disco_release", section: "Album"}, {query: "div#disco_type_l > 
div.disco_release", section: "Live Album"}, {query: "div#disco_type_e > div.disco_release", section: "EP"}, {query: "div#disco_type_a > div.disco_release", section: "Appears On"}, {query: "div#disco_type_c > div.disco_release", section: "Compilation"}, }, + tableQuery: "div#column_container_left div#discography", hasBio: true, } + if !r.Expand { + query.albumTables = append([]albumTable{ + {query: "div#disco_type_s > div.disco_release", section: "Album"}, + }, query.albumTables...) + } visitLink = r.Link } c := createCrawler(r.Delay, r.Cookies) @@ -189,21 +197,11 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { albumSelector := doc.Find("div#disco_type_s") album := colly.NewHTMLElementFromSelectionNode(r, albumSelector, albumSelector.Get(0), 0) - album.ForEach("div.disco_release", func(i int, h *colly.HTMLElement) { - rating := h.ChildText("div.disco_expandcat span.disco_cat_inner") - title := h.ChildText("div.disco_info a.album") - year := h.ChildText("div.disco_info span[class*='disco_year']") - reviews := h.ChildText("div.disco_reviews") - ratings := h.ChildText("div.disco_ratings") - average := h.ChildText("div.disco_avg_rating") - recommended := "" - if h.ChildAttr("div.disco_info b.disco_mainline_recommended", "title") == "Recommended" { - recommended = "" - } - row := []string{recommended, title, year, reviews, ratings, average, "Exp. Album", rating} - data.Rows = append(data.Rows, row) - data.Links = append(data.Links, DOMAIN+h.ChildAttr("div.disco_info > a", "href")) - }) + albumRows := make([][]string, 0) + albumLinks := make([]string, 0) + extractAlbumData(album, "div.disco_release", "Album", &albumRows, &albumLinks) + data.Rows = append(albumRows, data.Rows...) + data.Links = append(albumLinks, data.Links...) 
} }) c.OnError(func(r *colly.Response, err error) { From dbe18174d84510b4ba28bedaf6f9fb81a9a6ff38 Mon Sep 17 00:00:00 2001 From: mBaratta96 Date: Sun, 13 Aug 2023 14:16:50 +0200 Subject: [PATCH 3/4] ordered expanded fields --- app.go | 2 +- scraper/rym.go | 135 ++++++++++++++++++++++++++++--------------------- 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/app.go b/app.go index 05cc05f..916f5c2 100644 --- a/app.go +++ b/app.go @@ -37,7 +37,7 @@ func app(s scraper.Scraper) { s.SetLink(data.Links[index]) data = scraper.ScrapeData(s.AlbumList) for true { - // cli.CallClear() + cli.CallClear() cli.PrintMap(s.StyleColor(), data.Metadata) index = checkIndex(cli.PrintTable(data.Rows, data.Columns.Title, data.Columns.Width)) s.SetLink(data.Links[index]) diff --git a/scraper/rym.go b/scraper/rym.go index 4f09879..54c0d3c 100644 --- a/scraper/rym.go +++ b/scraper/rym.go @@ -7,6 +7,7 @@ import ( _ "image/jpeg" _ "image/png" "net/url" + "slices" "strings" "time" @@ -43,14 +44,17 @@ type RateYourMusic struct { Expand bool } -type albumTable struct { - query string +type AlbumTable struct { section string + query string + t rune + rows [][]string + links []string } type albumQuery struct { tableQuery string - albumTables []albumTable + albumTables []AlbumTable hasBio bool } @@ -97,24 +101,6 @@ func extractAlbumData(h *colly.HTMLElement, query string, section string, rows * }) } -func getAlbumListDiscography(c *colly.Collector, data *ScrapedData, query albumQuery) { - c.OnHTML("div#column_container_right div.section_artist_image > a > div", func(h *colly.HTMLElement) { - data.Metadata.Set("Top Album", h.Text) - }) - if query.hasBio { - c.OnHTML( - "div#column_container_right div.section_artist_biography > span.rendered_text", - func(h *colly.HTMLElement) { - data.Metadata.Set("Biography", h.Text) - }) - } - c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) { - for _, albumTable := range query.albumTables { - extractAlbumData(h, albumTable.query, 
albumTable.section, &data.Rows, &data.Links) - } - }) -} - func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) { data.Links = make([]string, 0) c := createCrawler(r.Delay, r.Cookies) @@ -136,6 +122,34 @@ func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) { return rBandColWidths[:], rBandColTitles[:] } +var credits albumQuery = albumQuery{ + albumTables: []AlbumTable{ + {"Credits", "div.disco_search_results > div.disco_release", 'c', make([][]string, 0), make([]string, 0)}, + }, + tableQuery: "div#column_container_left div.release_credits", + hasBio: false, +} + +var mainPage albumQuery = albumQuery{ + albumTables: []AlbumTable{ + {"Album", "div#disco_type_s > div.disco_release", 's', make([][]string, 0), make([]string, 0)}, + {"Live Album", "div#disco_type_l > div.disco_release", 'l', make([][]string, 0), make([]string, 0)}, + {"EP", "div#disco_type_e > div.disco_release", 'e', make([][]string, 0), make([]string, 0)}, + {"Appears On", "div#disco_type_a > div.disco_release", 'a', make([][]string, 0), make([]string, 0)}, + {"Compilation", "div#disco_type_c > div.disco_release", 'c', make([][]string, 0), make([]string, 0)}, + }, + tableQuery: "div#column_container_left div#discography", + hasBio: true, +} + +var typeToSection map[rune]string = map[rune]string{ + 's': "Album", + 'l': "Live Album", + 'e': "EP", + 'a': "Appears On", + 'c': "Compilation", +} + func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { var query albumQuery var visitLink string @@ -143,48 +157,47 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { data.Metadata = orderedmap.New[string, string]() if r.GetCredits { - query = albumQuery{ - albumTables: []albumTable{{query: "div.disco_search_results > div.disco_release", section: "Credits"}}, - tableQuery: "div#column_container_left div.release_credits", - hasBio: false, - } + query = credits visitLink = r.Link + "/credits" } else { - query = albumQuery{ - 
albumTables: []albumTable{ - {query: "div#disco_type_l > div.disco_release", section: "Live Album"}, - {query: "div#disco_type_e > div.disco_release", section: "EP"}, - {query: "div#disco_type_a > div.disco_release", section: "Appears On"}, - {query: "div#disco_type_c > div.disco_release", section: "Compilation"}, - }, - - tableQuery: "div#column_container_left div#discography", - hasBio: true, - } - if !r.Expand { - query.albumTables = append([]albumTable{ - {query: "div#disco_type_s > div.disco_release", section: "Album"}, - }, query.albumTables...) - } + query = mainPage visitLink = r.Link } c := createCrawler(r.Delay, r.Cookies) - getAlbumListDiscography(c, data, query) - expandForm := map[string][]byte{ - "sort": []byte("release_date.a,title.a"), - "show_appearances": []byte("false"), - "type": []byte("s"), - "action": []byte("ExpandDiscographySection"), - "rym_ajax_req": []byte("1"), + c.OnHTML("div#column_container_right div.section_artist_image > a > div", func(h *colly.HTMLElement) { + data.Metadata.Set("Top Album", h.Text) + }) + if query.hasBio { + c.OnHTML( + "div#column_container_right div.section_artist_biography > span.rendered_text", + func(h *colly.HTMLElement) { + data.Metadata.Set("Biography", h.Text) + }) } - if r.Expand { + + if !r.Expand || r.GetCredits { + c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) { + for _, albumTable := range query.albumTables { + extractAlbumData(h, albumTable.query, albumTable.section, &albumTable.rows, &albumTable.links) + } + }) + } else { + expandForm := map[string][]byte{ + "sort": []byte("release_date.a,title.a"), + "show_appearances": []byte("false"), + "action": []byte("ExpandDiscographySection"), + "rym_ajax_req": []byte("1"), + } if token, err := url.PathUnescape(r.Cookies["ulv"]); err == nil { expandForm["request_token"] = []byte(token) } c.OnHTML("div.section_artist_name input.rym_shortcut", func(h *colly.HTMLElement) { artistId := h.Attr("value") expandForm["artist_id"] = []byte(artistId[7 : 
len(artistId)-1]) - h.Request.PostMultipart("https://rateyourmusic.com/httprequest/ExpandDiscographySection", expandForm) + for _, albumTable := range query.albumTables { + expandForm["type"] = []byte(string(albumTable.t)) + h.Request.PostMultipart("https://rateyourmusic.com/httprequest/ExpandDiscographySection", expandForm) + } }) c.OnResponse(func(r *colly.Response) { if r.Headers.Get("content-type") == "application/javascript; charset=utf-8" { @@ -194,22 +207,28 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { if err != nil { fmt.Println("Error on response") } - - albumSelector := doc.Find("div#disco_type_s") + albumType := body[strings.Index(body, "('")+2] // string after javascript function name + albumTableIndex := slices.IndexFunc(query.albumTables, func(table AlbumTable) bool { + return table.t == rune(albumType) + }) + albumTable := &query.albumTables[albumTableIndex] + albumSelector := doc.Find(fmt.Sprintf("div#disco_type_%c", albumType)) album := colly.NewHTMLElementFromSelectionNode(r, albumSelector, albumSelector.Get(0), 0) - albumRows := make([][]string, 0) - albumLinks := make([]string, 0) - extractAlbumData(album, "div.disco_release", "Album", &albumRows, &albumLinks) - data.Rows = append(albumRows, data.Rows...) - data.Links = append(albumLinks, data.Links...) + extractAlbumData(album, "div.disco_release", albumTable.section, &albumTable.rows, &albumTable.links) } }) c.OnError(func(r *colly.Response, err error) { fmt.Println(err) }) + } c.Visit(visitLink) c.Wait() + for _, albumTable := range query.albumTables { + fmt.Println(albumTable.links) + data.Rows = append(data.Rows, albumTable.rows...) + data.Links = append(data.Links, albumTable.links...) 
+ } return rAlbumlistColWidths[:], rAlbumlistColTitles[:] } From 43a8d898f501f0d21b1cfa965f8c55ce6f13b6da Mon Sep 17 00:00:00 2001 From: mBaratta96 Date: Tue, 15 Aug 2023 11:19:57 +0200 Subject: [PATCH 4/4] some comments --- app.go | 6 ++++++ scraper/metallum.go | 26 +++++++++++++++++--------- scraper/parser.go | 3 --- scraper/rym.go | 41 ++++++++++++++++++++--------------------- 4 files changed, 43 insertions(+), 33 deletions(-) diff --git a/app.go b/app.go index 916f5c2..dce96a1 100644 --- a/app.go +++ b/app.go @@ -25,6 +25,10 @@ func checkIndex(index int) int { return index } +// RYM is the website that requires more configuration (cookies, credits scraping, etc...) +// However, we still perform similar operations for both websites: search an artist, +// select an artist, select an album, get album data. The similarity of these operations is what led to +// implementing the scraper.Scraper interface. func app(s scraper.Scraper) { data := scraper.ScrapeData(s.SearchBand) index := -1 @@ -35,12 +39,14 @@ func app(s scraper.Scraper) { } index = checkIndex(index) s.SetLink(data.Links[index]) + // Scrape the albums of an artist data = scraper.ScrapeData(s.AlbumList) for true { cli.CallClear() cli.PrintMap(s.StyleColor(), data.Metadata) index = checkIndex(cli.PrintTable(data.Rows, data.Columns.Title, data.Columns.Width)) s.SetLink(data.Links[index]) + // Scrape album data albumData := scraper.ScrapeData(s.Album) cli.CallClear() if albumData.Image != nil { diff --git a/scraper/metallum.go b/scraper/metallum.go index c4959b7..b7d8462 100644 --- a/scraper/metallum.go +++ b/scraper/metallum.go @@ -39,6 +39,10 @@ type Metallum struct { Link string } +// Metadata contains info like country of origin for band page and label for albums. 
+// For reference inspect: +// https://www.metal-archives.com/bands/Emperor/30 +// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func getMetadata(h *colly.HTMLElement, metadata *orderedmap.OrderedMap[string, string]) { keys, values := []string{}, []string{} h.ForEach("dt", func(_ int, h *colly.HTMLElement) { @@ -52,6 +56,8 @@ func getMetadata(h *colly.HTMLElement, metadata *orderedmap.OrderedMap[string, s } } +// Metallum search page renders the result of a query from a JSON payload. +// https://www.metal-archives.com/search?searchString=emperor&type=band_name func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) @@ -61,8 +67,9 @@ func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) { if err := json.Unmarshal(r.Body, &response); err != nil { fmt.Println("Can not unmarshal JSON") } - for _, el := range response.AaData { + // Search results are contained in the first element of the JSON array as a HTML string. + // We parse it and get the data. 
doc, err := goquery.NewDocumentFromReader(strings.NewReader(el[0])) if err != nil { fmt.Println("Error on response") @@ -80,15 +87,17 @@ func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) { return mBandColWidths[:], mAlbumColTitles[:] } +// https://www.metal-archives.com/bands/Emperor/30 func (m *Metallum) AlbumList(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) data.Metadata = orderedmap.New[string, string]() + // Get link to table with all albums c.OnHTML("#band_disco a[href*='all']", func(e *colly.HTMLElement) { e.Request.Visit(e.Attr("href")) }) - + // Scrape the table c.OnHTML("table.display.discog tbody tr", func(h *colly.HTMLElement) { var row [4]string h.ForEach(".album,.demo,.other,td a[href]", func(i int, h *colly.HTMLElement) { @@ -107,6 +116,7 @@ func (m *Metallum) AlbumList(data *ScrapedData) ([]int, []string) { return mAlbumlistColWidths[:], mAlbumlistColTitles[:] } +// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func (m *Metallum) Album(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) @@ -119,16 +129,14 @@ func (m *Metallum) Album(data *ScrapedData) ([]int, []string) { }) data.Rows = append(data.Rows, row[:]) }) - + // Get band id (useful if you want to check similar bands later) c.OnHTML("h2.band_name > a", func(h *colly.HTMLElement) { data.Metadata.Set("ID", path.Base(h.Attr("href"))) }) - c.OnHTML("a#cover.image", func(h *colly.HTMLElement) { image_src := h.ChildAttr("img", "src") h.Request.Visit(image_src) }) - c.OnResponse(func(r *colly.Response) { if r.Headers.Get("content-type") == "image/jpeg" { var err error @@ -138,7 +146,6 @@ func (m *Metallum) Album(data *ScrapedData) ([]int, []string) { } } }) - c.OnHTML("dl.float_right,dl.float_left", func(h *colly.HTMLElement) { getMetadata(h, data.Metadata) }) @@ -155,6 +162,7 @@ func (m *Metallum) SetLink(link string) { m.Link = link } +// 
https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) @@ -172,7 +180,6 @@ func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) { }) data.Rows = append(data.Rows, row[:]) }) - c.OnHTML("div.reviewBox", func(h *colly.HTMLElement) { review := h.ChildText("h3.reviewTitle") + "\n" review += h.ChildText("div:not([attr_all])") + "\n" @@ -184,6 +191,7 @@ func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) { return mReviewColWidths[:], mReviewColTitles[:] } +// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func (m *Metallum) Credits() *orderedmap.OrderedMap[string, string] { c := colly.NewCollector() credits := orderedmap.New[string, string]() @@ -212,8 +220,8 @@ func (m *Metallum) similarArtists(data *ScrapedData) ([]int, []string) { }) data.Rows = append(data.Rows, row[:]) }) - - c.OnScraped(func(_ *colly.Response) { // This makes len(data.Rown) = len(data.Links) + 1 (see app.go) + // This makes len(data.Rows) = len(data.Links) + 1 (see app.go) + c.OnScraped(func(_ *colly.Response) { data.Rows = append(data.Rows, []string{"Go back to choices", "", "", ""}) }) diff --git a/scraper/parser.go b/scraper/parser.go index 0e23525..d6de188 100644 --- a/scraper/parser.go +++ b/scraper/parser.go @@ -38,13 +38,11 @@ func credentials() (string, string, error) { if err != nil { return "", "", err } - fmt.Print("Enter Password: ") bytePassword, err := term.ReadPassword(int(syscall.Stdin)) if err != nil { return "", "", err } - password := string(bytePassword) return strings.TrimSpace(username), strings.TrimSpace(password), nil } @@ -84,7 +82,6 @@ func SaveCookie(cookies map[string]string, path string) { panic(err) } defer f.Close() - as_json, err := json.MarshalIndent(cookies, "", "\t") if err != nil { panic(err) diff --git a/scraper/rym.go b/scraper/rym.go index 
54c0d3c..3634977 100644 --- a/scraper/rym.go +++ b/scraper/rym.go @@ -47,7 +47,7 @@ type RateYourMusic struct { type AlbumTable struct { section string query string - t rune + t rune // see AlbumList. t defines the type of the album section ('s' for albums, 'e' for EPs, etc...) rows [][]string links []string } @@ -58,6 +58,7 @@ type albumQuery struct { hasBio bool } +// RYM requires an async crawler with a delay limit. Otherwise your IP will be banned. func createCrawler(delay int, cookies map[string]string) *colly.Collector { c := colly.NewCollector(colly.Async(true), colly.UserAgent(USERAGENT)) if delay > 0 { @@ -101,6 +102,7 @@ func extractAlbumData(h *colly.HTMLElement, query string, section string, rows * }) } +// https://rateyourmusic.com/search?searchterm=velvet%20underground&searchtype=a func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) { data.Links = make([]string, 0) c := createCrawler(r.Delay, r.Cookies) @@ -122,6 +124,11 @@ func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) { return rBandColWidths[:], rBandColTitles[:] } +// In both cases, we are scraping tables containing album data. However, the jQuery-style selector strings used to retrieve the data +// change. +// AlbumTable is a struct for extracting album data from a particular section (Album, EP, credit...) +// tableQuery defines the wrapper (also a table) of all the tables of a page. +// hasBio decides whether we look for the artist's bio (which exists only on the main page). 
var credits albumQuery = albumQuery{ albumTables: []AlbumTable{ {"Credits", "div.disco_search_results > div.disco_release", 'c', make([][]string, 0), make([]string, 0)}, @@ -142,14 +149,9 @@ var mainPage albumQuery = albumQuery{ hasBio: true, } -var typeToSection map[rune]string = map[rune]string{ - 's': "Album", - 'l': "Live Album", - 'e': "EP", - 'a': "Appears On", - 'c': "Compilation", -} - +// https://rateyourmusic.com/artist/the-velvet-underground +// https://rateyourmusic.com/artist/the-velvet-underground/credits/ +// r.Expand does what pressing the "showing all" button does. It is applied to all sections of the main artist page. func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { var query albumQuery var visitLink string @@ -174,10 +176,10 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { data.Metadata.Set("Biography", h.Text) }) } - if !r.Expand || r.GetCredits { c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) { - for _, albumTable := range query.albumTables { + for i := range query.albumTables { + albumTable := &query.albumTables[i] extractAlbumData(h, albumTable.query, albumTable.section, &albumTable.rows, &albumTable.links) } }) @@ -199,6 +201,8 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { h.Request.PostMultipart("https://rateyourmusic.com/httprequest/ExpandDiscographySection", expandForm) } }) + // Body of the response is a Javascript function returning an object containing the expanded HTML. We parse + // the function, and get the substring that contains only the HTML. The HTML is parsed and data is extracted. 
c.OnResponse(func(r *colly.Response) { if r.Headers.Get("content-type") == "application/javascript; charset=utf-8" { body := string(r.Body) @@ -207,7 +211,7 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { if err != nil { fmt.Println("Error on response") } - albumType := body[strings.Index(body, "('")+2] // string after javascript function name + albumType := body[strings.Index(body, "('")+2] // char after javascript function header albumTableIndex := slices.IndexFunc(query.albumTables, func(table AlbumTable) bool { return table.t == rune(albumType) }) @@ -232,6 +236,7 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { return rAlbumlistColWidths[:], rAlbumlistColTitles[:] } +// https://rateyourmusic.com/release/album/the-velvet-underground-nico/the-velvet-underground-and-nico/ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { c := createCrawler(r.Delay, r.Cookies) data.Metadata = orderedmap.New[string, string]() @@ -240,7 +245,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { image_url := h.ChildAttr("img", "src") h.Request.Visit("https:" + image_url) }) - c.OnResponse(func(r *colly.Response) { if r.Headers.Get("content-type") == "image/jpg" || r.Headers.Get("content-type") == "image/png" { var err error @@ -250,7 +254,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { } } }) - c.OnHTML("table.album_info > tbody > tr", func(h *colly.HTMLElement) { key := h.ChildText("th") value := strings.Join(strings.Fields(strings.ReplaceAll(h.ChildText("td"), "\n", "")), " ") @@ -262,7 +265,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { albumId := h.Attr("value") data.Metadata.Set("ID", albumId[6:len(albumId)-1]) }) - c.OnHTML("div#column_container_left div.section_tracklisting ul#tracks", func(h *colly.HTMLElement) { h.ForEach("li.track", func(_ int, h *colly.HTMLElement) { if len(h.ChildText("span.tracklist_total")) > 0 { @@ -276,7 
+278,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { } }) }) - c.Visit(r.Link) c.Wait() return rAlbumColWidths[:], rAlbumColTitles[:] @@ -290,6 +291,8 @@ func (r *RateYourMusic) SetLink(link string) { r.Link = link } +// https://rateyourmusic.com/release/album/the-velvet-underground-nico/the-velvet-underground-and-nico/reviews/1/ +// Recursively scrape all reviews (may generate problems for very popular albums) func (r *RateYourMusic) ReviewsList(data *ScrapedData) ([]int, []string) { c := createCrawler(r.Delay, r.Cookies) data.Links = make([]string, 0) @@ -297,14 +300,12 @@ func (r *RateYourMusic) ReviewsList(data *ScrapedData) ([]int, []string) { c.OnHTML("span.navspan a.navlinknext", func(h *colly.HTMLElement) { h.Request.Visit(h.Attr("href")) }) - c.OnHTML("div.review > div.review_header ", func(h *colly.HTMLElement) { user := h.ChildText("span.review_user") date := h.ChildText("span.review_date") rating := strings.Split(h.ChildAttr("span.review_rating > img", "alt"), " ")[0] data.Rows = append(data.Rows, []string{user, date, rating}) }) - c.OnHTML("div.review > div.review_body ", func(h *colly.HTMLElement) { data.Links = append(data.Links, h.ChildText("span.rendered_text")) }) @@ -346,6 +347,7 @@ var loginForm = map[string][]byte{ "action": []byte("Login"), } +// For reference, inspect https://rateyourmusic.com/account/login func (r *RateYourMusic) Login() { user, password, err := credentials() if err != nil { @@ -360,7 +362,6 @@ func (r *RateYourMusic) Login() { c.OnError(func(_ *colly.Response, err error) { fmt.Println("Something went wrong:", err) }) - c.OnResponse(func(response *colly.Response) { cookies := response.Headers.Values("Set-Cookie") for _, cookieStr := range cookies { @@ -368,7 +369,6 @@ func (r *RateYourMusic) Login() { r.Cookies[cookie[0]] = cookie[1] } }) - c.PostMultipart(LOGIN, loginForm) c.Wait() } @@ -391,7 +391,6 @@ func (r *RateYourMusic) sendRating(rating string, id string) { c.OnResponse(func(r 
*colly.Response) { fmt.Println(r.StatusCode, "Vote has been uploaded.") }) - c.OnError(func(_ *colly.Response, err error) { fmt.Println("Something went wrong:", err) })