diff --git a/app.go b/app.go index 513f777..dce96a1 100644 --- a/app.go +++ b/app.go @@ -25,6 +25,10 @@ func checkIndex(index int) int { return index } +// RYM is the website that requires more configuration (cookies, credits scraping, etc...) +// However, we still make similar operations for both of the two websites: search an artist, +// select an artist, select an album, get album data. The similarity of these operations is what led to +// implementing the scraper.Scraper interface. func app(s scraper.Scraper) { data := scraper.ScrapeData(s.SearchBand) index := -1 @@ -35,12 +39,14 @@ func app(s scraper.Scraper) { } index = checkIndex(index) s.SetLink(data.Links[index]) + // Scrape the albums of an artist data = scraper.ScrapeData(s.AlbumList) for true { cli.CallClear() cli.PrintMap(s.StyleColor(), data.Metadata) index = checkIndex(cli.PrintTable(data.Rows, data.Columns.Title, data.Columns.Width)) s.SetLink(data.Links[index]) + // Scrape album data albumData := scraper.ScrapeData(s.Album) cli.CallClear() if albumData.Image != nil { @@ -108,7 +114,7 @@ func app(s scraper.Scraper) { s.SetLink(similData.Links[similIndex]) data = scraper.ScrapeData(s.AlbumList) goingBack = true - } else { // get back to current artist and do nothing + } else { // similIndex is the "Go back" option.
Get back to current artist and do nothing s.SetLink(data.Links[index]) } } @@ -122,6 +128,7 @@ func app(s scraper.Scraper) { func main() { website := flag.String("website", "", "Desired Website ('metallum' or 'rym')") rymCredits := flag.Bool("credits", false, "Display RYM credits") + expand := flag.Bool("expand", false, "Expand RYM albums") flag.Parse() if len(flag.Args()) == 0 { os.Exit(1) @@ -146,6 +153,7 @@ func main() { r := &scraper.RateYourMusic{} r.Link = search r.GetCredits = *rymCredits + r.Expand = *expand config, _ := scraper.ReadUserConfiguration(configFilePath) r.Delay = config.Delay if config.Authenticate { diff --git a/scraper/metallum.go b/scraper/metallum.go index c4959b7..b7d8462 100644 --- a/scraper/metallum.go +++ b/scraper/metallum.go @@ -39,6 +39,10 @@ type Metallum struct { Link string } +// Metadata contains info like country of origin for band page and label for albums. +// For reference inspect: +// https://www.metal-archives.com/bands/Emperor/30 +// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func getMetadata(h *colly.HTMLElement, metadata *orderedmap.OrderedMap[string, string]) { keys, values := []string{}, []string{} h.ForEach("dt", func(_ int, h *colly.HTMLElement) { @@ -52,6 +56,8 @@ func getMetadata(h *colly.HTMLElement, metadata *orderedmap.OrderedMap[string, s } } +// Metallum search page renders the result of a query from a JSON payload. +// https://www.metal-archives.com/search?searchString=emperor&type=band_name func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) @@ -61,8 +67,9 @@ func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) { if err := json.Unmarshal(r.Body, &response); err != nil { fmt.Println("Can not unmarshal JSON") } - for _, el := range response.AaData { + // Search results are contained in the first element of the JSON array as a HTML string. + // We parse it and get the data. 
doc, err := goquery.NewDocumentFromReader(strings.NewReader(el[0])) if err != nil { fmt.Println("Error on response") @@ -80,15 +87,17 @@ func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) { return mBandColWidths[:], mAlbumColTitles[:] } +// https://www.metal-archives.com/bands/Emperor/30 func (m *Metallum) AlbumList(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) data.Metadata = orderedmap.New[string, string]() + // Get link to table with all albums c.OnHTML("#band_disco a[href*='all']", func(e *colly.HTMLElement) { e.Request.Visit(e.Attr("href")) }) - + // Scrape the table c.OnHTML("table.display.discog tbody tr", func(h *colly.HTMLElement) { var row [4]string h.ForEach(".album,.demo,.other,td a[href]", func(i int, h *colly.HTMLElement) { @@ -107,6 +116,7 @@ func (m *Metallum) AlbumList(data *ScrapedData) ([]int, []string) { return mAlbumlistColWidths[:], mAlbumlistColTitles[:] } +// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func (m *Metallum) Album(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) @@ -119,16 +129,14 @@ func (m *Metallum) Album(data *ScrapedData) ([]int, []string) { }) data.Rows = append(data.Rows, row[:]) }) - + // Get band id (useful if you want to check similar bands later) c.OnHTML("h2.band_name > a", func(h *colly.HTMLElement) { data.Metadata.Set("ID", path.Base(h.Attr("href"))) }) - c.OnHTML("a#cover.image", func(h *colly.HTMLElement) { image_src := h.ChildAttr("img", "src") h.Request.Visit(image_src) }) - c.OnResponse(func(r *colly.Response) { if r.Headers.Get("content-type") == "image/jpeg" { var err error @@ -138,7 +146,6 @@ func (m *Metallum) Album(data *ScrapedData) ([]int, []string) { } } }) - c.OnHTML("dl.float_right,dl.float_left", func(h *colly.HTMLElement) { getMetadata(h, data.Metadata) }) @@ -155,6 +162,7 @@ func (m *Metallum) SetLink(link string) { m.Link = link } +// 
https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) { c := colly.NewCollector() data.Links = make([]string, 0) @@ -172,7 +180,6 @@ func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) { }) data.Rows = append(data.Rows, row[:]) }) - c.OnHTML("div.reviewBox", func(h *colly.HTMLElement) { review := h.ChildText("h3.reviewTitle") + "\n" review += h.ChildText("div:not([attr_all])") + "\n" @@ -184,6 +191,7 @@ func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) { return mReviewColWidths[:], mReviewColTitles[:] } +// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92 func (m *Metallum) Credits() *orderedmap.OrderedMap[string, string] { c := colly.NewCollector() credits := orderedmap.New[string, string]() @@ -212,8 +220,8 @@ func (m *Metallum) similarArtists(data *ScrapedData) ([]int, []string) { }) data.Rows = append(data.Rows, row[:]) }) - - c.OnScraped(func(_ *colly.Response) { // This makes len(data.Rown) = len(data.Links) + 1 (see app.go) + // This makes len(data.Rows) = len(data.Links) + 1 (see app.go) + c.OnScraped(func(_ *colly.Response) { data.Rows = append(data.Rows, []string{"Go back to choices", "", "", ""}) }) diff --git a/scraper/parser.go b/scraper/parser.go index 0e23525..d6de188 100644 --- a/scraper/parser.go +++ b/scraper/parser.go @@ -38,13 +38,11 @@ func credentials() (string, string, error) { if err != nil { return "", "", err } - fmt.Print("Enter Password: ") bytePassword, err := term.ReadPassword(int(syscall.Stdin)) if err != nil { return "", "", err } - password := string(bytePassword) return strings.TrimSpace(username), strings.TrimSpace(password), nil } @@ -84,7 +82,6 @@ func SaveCookie(cookies map[string]string, path string) { panic(err) } defer f.Close() - as_json, err := json.MarshalIndent(cookies, "", "\t") if err != nil { panic(err) diff --git a/scraper/rym.go b/scraper/rym.go index 
a4155df..3634977 100644 --- a/scraper/rym.go +++ b/scraper/rym.go @@ -7,6 +7,7 @@ import ( _ "image/jpeg" _ "image/png" "net/url" + "slices" "strings" "time" @@ -40,19 +41,24 @@ type RateYourMusic struct { Link string Cookies map[string]string GetCredits bool + Expand bool } -type albumTable struct { - query string +type AlbumTable struct { section string + query string + t rune // see AlbumList. t defines the type of the album section ('s' for albums, 'e' for EPs, etc...) + rows [][]string + links []string } type albumQuery struct { tableQuery string - albumTables []albumTable + albumTables []AlbumTable hasBio bool } +// RYM requires an async crawler with delay limitation. Otherwise your IP will be banned. func createCrawler(delay int, cookies map[string]string) *colly.Collector { c := colly.NewCollector(colly.Async(true), colly.UserAgent(USERAGENT)) if delay > 0 { @@ -78,39 +84,25 @@ func createCookieHeader(cookies map[string]string) string { return strings.Join(cookieString, "; ") } -func getAlbumListDiscography(c *colly.Collector, data *ScrapedData, query albumQuery) { - c.OnHTML("div#column_container_right div.section_artist_image > a > div", func(h *colly.HTMLElement) { - data.Metadata.Set("Top Album", h.Text) - }) - if query.hasBio { - c.OnHTML( - "div#column_container_right div.section_artist_biography > span.rendered_text", - func(h *colly.HTMLElement) { - data.Metadata.Set("Biography", h.Text) - }) - } - - c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) { - for _, albumTable := range query.albumTables { - h.ForEach(albumTable.query, func(_ int, h *colly.HTMLElement) { - rating := h.ChildText("div.disco_expandcat span.disco_cat_inner") - title := h.ChildText("div.disco_info a.album") - year := h.ChildText("div.disco_info span[class*='disco_year']") - reviews := h.ChildText("div.disco_reviews") - ratings := h.ChildText("div.disco_ratings") - average := h.ChildText("div.disco_avg_rating") - recommended := "" - if h.ChildAttr("div.disco_info 
b.disco_mainline_recommended", "title") == "Recommended" { - recommended = "" - } - row := []string{recommended, title, year, reviews, ratings, average, albumTable.section, rating} - data.Rows = append(data.Rows, row) - data.Links = append(data.Links, DOMAIN+h.ChildAttr("div.disco_info > a", "href")) - }) +func extractAlbumData(h *colly.HTMLElement, query string, section string, rows *[][]string, links *[]string) { + h.ForEach(query, func(_ int, h *colly.HTMLElement) { + rating := h.ChildText("div.disco_expandcat span.disco_cat_inner") + title := h.ChildText("div.disco_info a.album") + year := h.ChildText("div.disco_info span[class*='disco_year']") + reviews := h.ChildText("div.disco_reviews") + ratings := h.ChildText("div.disco_ratings") + average := h.ChildText("div.disco_avg_rating") + recommended := "" + if h.ChildAttr("div.disco_info b.disco_mainline_recommended", "title") == "Recommended" { + recommended = "" } + row := []string{recommended, title, year, reviews, ratings, average, section, rating} + *rows = append(*rows, row) + *links = append(*links, DOMAIN+h.ChildAttr("div.disco_info > a", "href")) }) } +// https://rateyourmusic.com/search?searchterm=velvet%20underground&searchtype=a func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) { data.Links = make([]string, 0) c := createCrawler(r.Delay, r.Cookies) @@ -132,6 +124,34 @@ func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) { return rBandColWidths[:], rBandColTitles[:] } +// In both cases, we are scraping table containing album data. However, the Jquery strings to retrieved the data +// changes. +// AlbumTable is a struct for extracting album data from a particular section (Album, EP, credit...) +// tableQuery defines the wrapper (also a table) of all the tables of a page. +// hasBio is just to decide if we want to look for the artist's bio (which exists only in the main page). 
+var credits albumQuery = albumQuery{ + albumTables: []AlbumTable{ + {"Credits", "div.disco_search_results > div.disco_release", 'c', make([][]string, 0), make([]string, 0)}, + }, + tableQuery: "div#column_container_left div.release_credits", + hasBio: false, +} + +var mainPage albumQuery = albumQuery{ + albumTables: []AlbumTable{ + {"Album", "div#disco_type_s > div.disco_release", 's', make([][]string, 0), make([]string, 0)}, + {"Live Album", "div#disco_type_l > div.disco_release", 'l', make([][]string, 0), make([]string, 0)}, + {"EP", "div#disco_type_e > div.disco_release", 'e', make([][]string, 0), make([]string, 0)}, + {"Appears On", "div#disco_type_a > div.disco_release", 'a', make([][]string, 0), make([]string, 0)}, + {"Compilation", "div#disco_type_c > div.disco_release", 'c', make([][]string, 0), make([]string, 0)}, + }, + tableQuery: "div#column_container_left div#discography", + hasBio: true, +} + +// https://rateyourmusic.com/artist/the-velvet-underground +// https://rateyourmusic.com/artist/the-velvet-underground/credits/ +// r.Expand does what pressing the "showing all" button does. It is applied to every section of the main artist page.
func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { var query albumQuery var visitLink string @@ -139,33 +159,84 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) { data.Metadata = orderedmap.New[string, string]() if r.GetCredits { - query = albumQuery{ - albumTables: []albumTable{{query: "div.disco_search_results > div.disco_release", section: "Credits"}}, - tableQuery: "div#column_container_left div.release_credits", - hasBio: false, - } + query = credits visitLink = r.Link + "/credits" } else { - query = albumQuery{ - albumTables: []albumTable{ - {query: "div#disco_type_s > div.disco_release", section: "Album"}, - {query: "div#disco_type_l > div.disco_release", section: "Live Album"}, - {query: "div#disco_type_e > div.disco_release", section: "EP"}, - {query: "div#disco_type_a > div.disco_release", section: "Appears On"}, - {query: "div#disco_type_c > div.disco_release", section: "Compilation"}, - }, - tableQuery: "div#column_container_left div#discography", - hasBio: true, - } + query = mainPage visitLink = r.Link } c := createCrawler(r.Delay, r.Cookies) - getAlbumListDiscography(c, data, query) + c.OnHTML("div#column_container_right div.section_artist_image > a > div", func(h *colly.HTMLElement) { + data.Metadata.Set("Top Album", h.Text) + }) + if query.hasBio { + c.OnHTML( + "div#column_container_right div.section_artist_biography > span.rendered_text", + func(h *colly.HTMLElement) { + data.Metadata.Set("Biography", h.Text) + }) + } + if !r.Expand || r.GetCredits { + c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) { + for i := range query.albumTables { + albumTable := &query.albumTables[i] + extractAlbumData(h, albumTable.query, albumTable.section, &albumTable.rows, &albumTable.links) + } + }) + } else { + expandForm := map[string][]byte{ + "sort": []byte("release_date.a,title.a"), + "show_appearances": []byte("false"), + "action": []byte("ExpandDiscographySection"), + "rym_ajax_req": []byte("1"), + } + if 
token, err := url.PathUnescape(r.Cookies["ulv"]); err == nil { + expandForm["request_token"] = []byte(token) + } + c.OnHTML("div.section_artist_name input.rym_shortcut", func(h *colly.HTMLElement) { + artistId := h.Attr("value") + expandForm["artist_id"] = []byte(artistId[7 : len(artistId)-1]) + for _, albumTable := range query.albumTables { + expandForm["type"] = []byte(string(albumTable.t)) + h.Request.PostMultipart("https://rateyourmusic.com/httprequest/ExpandDiscographySection", expandForm) + } + }) + // Body of the response is a Javascript function returning a object containing the expanded HTML. We parse + // the function, and get the substring that contains only the HTML. The HTML is parsed and data is extracted. + c.OnResponse(func(r *colly.Response) { + if r.Headers.Get("content-type") == "application/javascript; charset=utf-8" { + body := string(r.Body) + newHTML := body[strings.Index(body, " tbody > tr", func(h *colly.HTMLElement) { key := h.ChildText("th") value := strings.Join(strings.Fields(strings.ReplaceAll(h.ChildText("td"), "\n", "")), " ") @@ -196,7 +265,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { albumId := h.Attr("value") data.Metadata.Set("ID", albumId[6:len(albumId)-1]) }) - c.OnHTML("div#column_container_left div.section_tracklisting ul#tracks", func(h *colly.HTMLElement) { h.ForEach("li.track", func(_ int, h *colly.HTMLElement) { if len(h.ChildText("span.tracklist_total")) > 0 { @@ -210,7 +278,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) { } }) }) - c.Visit(r.Link) c.Wait() return rAlbumColWidths[:], rAlbumColTitles[:] @@ -224,6 +291,8 @@ func (r *RateYourMusic) SetLink(link string) { r.Link = link } +// https://rateyourmusic.com/release/album/the-velvet-underground-nico/the-velvet-underground-and-nico/reviews/1/ +// Recursively scrape all reviews (may generate problems for very popular albums) func (r *RateYourMusic) ReviewsList(data *ScrapedData) ([]int, []string) { c := 
createCrawler(r.Delay, r.Cookies) data.Links = make([]string, 0) @@ -231,14 +300,12 @@ func (r *RateYourMusic) ReviewsList(data *ScrapedData) ([]int, []string) { c.OnHTML("span.navspan a.navlinknext", func(h *colly.HTMLElement) { h.Request.Visit(h.Attr("href")) }) - c.OnHTML("div.review > div.review_header ", func(h *colly.HTMLElement) { user := h.ChildText("span.review_user") date := h.ChildText("span.review_date") rating := strings.Split(h.ChildAttr("span.review_rating > img", "alt"), " ")[0] data.Rows = append(data.Rows, []string{user, date, rating}) }) - c.OnHTML("div.review > div.review_body ", func(h *colly.HTMLElement) { data.Links = append(data.Links, h.ChildText("span.rendered_text")) }) @@ -280,6 +347,7 @@ var loginForm = map[string][]byte{ "action": []byte("Login"), } +// For reference, inspect https://rateyourmusic.com/account/login func (r *RateYourMusic) Login() { user, password, err := credentials() if err != nil { @@ -294,7 +362,6 @@ func (r *RateYourMusic) Login() { c.OnError(func(_ *colly.Response, err error) { fmt.Println("Something went wrong:", err) }) - c.OnResponse(func(response *colly.Response) { cookies := response.Headers.Values("Set-Cookie") for _, cookieStr := range cookies { @@ -302,7 +369,6 @@ func (r *RateYourMusic) Login() { r.Cookies[cookie[0]] = cookie[1] } }) - c.PostMultipart(LOGIN, loginForm) c.Wait() } @@ -325,7 +391,6 @@ func (r *RateYourMusic) sendRating(rating string, id string) { c.OnResponse(func(r *colly.Response) { fmt.Println(r.StatusCode, "Vote has been uploaded.") }) - c.OnError(func(_ *colly.Response, err error) { fmt.Println("Something went wrong:", err) })