This repository has been archived by the owner on Nov 19, 2024. It is now read-only.

Commit

some comments
mBaratta96 committed Aug 15, 2023
1 parent dbe1817 commit 43a8d89
Showing 4 changed files with 43 additions and 33 deletions.
6 changes: 6 additions & 0 deletions app.go
@@ -25,6 +25,10 @@ func checkIndex(index int) int {
return index
}

// RYM is the website that requires more configuration (cookies, credits scraping, etc.).
// However, we perform similar operations on both websites: search for an artist,
// select an artist, select an album, get album data. The similarity of these operations is what led to
// implementing the scraper.Scraper interface.
func app(s scraper.Scraper) {
data := scraper.ScrapeData(s.SearchBand)
index := -1
@@ -35,12 +39,14 @@ func app(s scraper.Scraper) {
}
index = checkIndex(index)
s.SetLink(data.Links[index])
// Scrape the albums of an artist
data = scraper.ScrapeData(s.AlbumList)
for true {
cli.CallClear()
cli.PrintMap(s.StyleColor(), data.Metadata)
index = checkIndex(cli.PrintTable(data.Rows, data.Columns.Title, data.Columns.Width))
s.SetLink(data.Links[index])
// Scrape album data
albumData := scraper.ScrapeData(s.Album)
cli.CallClear()
if albumData.Image != nil {
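The comment added to app() motivates the scraper.Scraper interface, but the interface itself is outside this diff. The following is a minimal sketch of what it plausibly looks like, inferred from the calls made in app() and from the method sets of Metallum and RateYourMusic below; the ScrapedData fields, the Columns layout, the Image type, the StyleColor return type and the orderedmap import path are assumptions, not the repository's actual definitions.

// Sketch only: the real definitions live in the scraper package and may differ.
package scraper

import (
	"image"

	orderedmap "github.com/wk8/go-ordered-map/v2" // import path assumed from the New/Set generic API used in this diff
)

// ScrapedData is the bundle every scraping step fills in and app() consumes.
type ScrapedData struct {
	Rows     [][]string                             // table rows printed by cli.PrintTable
	Links    []string                               // link (or text) associated with each row
	Metadata *orderedmap.OrderedMap[string, string] // key/value info printed by cli.PrintMap
	Image    image.Image                            // album cover, when available (type assumed)
	Columns  struct {
		Title []string // column titles
		Width []int    // column widths
	}
}

// Scraper is implemented by both Metallum and RateYourMusic; further methods
// (Credits, Login, ...) may belong here as well.
type Scraper interface {
	SearchBand(data *ScrapedData) ([]int, []string)
	AlbumList(data *ScrapedData) ([]int, []string)
	Album(data *ScrapedData) ([]int, []string)
	ReviewsList(data *ScrapedData) ([]int, []string)
	SetLink(link string)
	StyleColor() string // color used by the cli package (return type assumed)
}

// ScrapeData runs one scraping step and attaches the column layout it returns.
func ScrapeData(step func(*ScrapedData) ([]int, []string)) ScrapedData {
	data := ScrapedData{}
	widths, titles := step(&data)
	data.Columns.Width = widths
	data.Columns.Title = titles
	return data
}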
26 changes: 17 additions & 9 deletions scraper/metallum.go
@@ -39,6 +39,10 @@ type Metallum struct {
Link string
}

// Metadata contains info such as the country of origin for band pages and the label for album pages.
// For reference, inspect:
// https://www.metal-archives.com/bands/Emperor/30
// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92
func getMetadata(h *colly.HTMLElement, metadata *orderedmap.OrderedMap[string, string]) {
keys, values := []string{}, []string{}
h.ForEach("dt", func(_ int, h *colly.HTMLElement) {
@@ -52,6 +56,8 @@ func getMetadata(h *colly.HTMLElement, metadata *orderedmap.OrderedMap[string, s
}
}

// The Metallum search page renders the results of a query from a JSON payload.
// https://www.metal-archives.com/search?searchString=emperor&type=band_name
func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) {
c := colly.NewCollector()
data.Links = make([]string, 0)
@@ -61,8 +67,9 @@ func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) {
if err := json.Unmarshal(r.Body, &response); err != nil {
fmt.Println("Can not unmarshal JSON")
}

for _, el := range response.AaData {
// Search results are contained in the first element of each JSON array entry as an HTML string.
// We parse it and get the data.
doc, err := goquery.NewDocumentFromReader(strings.NewReader(el[0]))
if err != nil {
fmt.Println("Error on response")
@@ -80,15 +87,17 @@ func (m *Metallum) SearchBand(data *ScrapedData) ([]int, []string) {
return mBandColWidths[:], mAlbumColTitles[:]
}

// https://www.metal-archives.com/bands/Emperor/30
func (m *Metallum) AlbumList(data *ScrapedData) ([]int, []string) {
c := colly.NewCollector()
data.Links = make([]string, 0)
data.Metadata = orderedmap.New[string, string]()

// Get link to table with all albums
c.OnHTML("#band_disco a[href*='all']", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})

// Scrape the table
c.OnHTML("table.display.discog tbody tr", func(h *colly.HTMLElement) {
var row [4]string
h.ForEach(".album,.demo,.other,td a[href]", func(i int, h *colly.HTMLElement) {
@@ -107,6 +116,7 @@ func (m *Metallum) AlbumList(data *ScrapedData) ([]int, []string) {
return mAlbumlistColWidths[:], mAlbumlistColTitles[:]
}

// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92
func (m *Metallum) Album(data *ScrapedData) ([]int, []string) {
c := colly.NewCollector()
data.Links = make([]string, 0)
@@ -119,16 +129,14 @@ func (m *Metallum) Album(data *ScrapedData) ([]int, []string) {
})
data.Rows = append(data.Rows, row[:])
})

// Get band id (useful if you want to check similar bands later)
c.OnHTML("h2.band_name > a", func(h *colly.HTMLElement) {
data.Metadata.Set("ID", path.Base(h.Attr("href")))
})

c.OnHTML("a#cover.image", func(h *colly.HTMLElement) {
image_src := h.ChildAttr("img", "src")
h.Request.Visit(image_src)
})

c.OnResponse(func(r *colly.Response) {
if r.Headers.Get("content-type") == "image/jpeg" {
var err error
@@ -138,7 +146,6 @@ func (m *Metallum) Album(data *ScrapedData) ([]int, []string) {
}
}
})

c.OnHTML("dl.float_right,dl.float_left", func(h *colly.HTMLElement) {
getMetadata(h, data.Metadata)
})
@@ -155,6 +162,7 @@ func (m *Metallum) SetLink(link string) {
m.Link = link
}

// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92
func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) {
c := colly.NewCollector()
data.Links = make([]string, 0)
@@ -172,7 +180,6 @@ func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) {
})
data.Rows = append(data.Rows, row[:])
})

c.OnHTML("div.reviewBox", func(h *colly.HTMLElement) {
review := h.ChildText("h3.reviewTitle") + "\n"
review += h.ChildText("div:not([attr_all])") + "\n"
@@ -184,6 +191,7 @@ func (m *Metallum) ReviewsList(data *ScrapedData) ([]int, []string) {
return mReviewColWidths[:], mReviewColTitles[:]
}

// https://www.metal-archives.com/albums/Emperor/Anthems_to_the_Welkin_at_Dusk/92
func (m *Metallum) Credits() *orderedmap.OrderedMap[string, string] {
c := colly.NewCollector()
credits := orderedmap.New[string, string]()
@@ -212,8 +220,8 @@ func (m *Metallum) similarArtists(data *ScrapedData) ([]int, []string) {
})
data.Rows = append(data.Rows, row[:])
})

c.OnScraped(func(_ *colly.Response) { // This makes len(data.Rown) = len(data.Links) + 1 (see app.go)
// This makes len(data.Rows) = len(data.Links) + 1 (see app.go)
c.OnScraped(func(_ *colly.Response) {
data.Rows = append(data.Rows, []string{"Go back to choices", "", "", ""})
})

3 changes: 0 additions & 3 deletions scraper/parser.go
@@ -38,13 +38,11 @@ func credentials() (string, string, error) {
if err != nil {
return "", "", err
}

fmt.Print("Enter Password: ")
bytePassword, err := term.ReadPassword(int(syscall.Stdin))
if err != nil {
return "", "", err
}

password := string(bytePassword)
return strings.TrimSpace(username), strings.TrimSpace(password), nil
}
@@ -84,7 +82,6 @@ func SaveCookie(cookies map[string]string, path string) {
panic(err)
}
defer f.Close()

as_json, err := json.MarshalIndent(cookies, "", "\t")
if err != nil {
panic(err)
41 changes: 20 additions & 21 deletions scraper/rym.go
@@ -47,7 +47,7 @@ type RateYourMusic struct {
type AlbumTable struct {
section string
query string
t rune
t rune // see AlbumList: t identifies the type of the album section ('s' for albums, 'e' for EPs, etc.)
rows [][]string
links []string
}
@@ -58,6 +58,7 @@ type albumQuery struct {
hasBio bool
}

// RYM requires an async crawler with a delay between requests; otherwise your IP will get banned.
func createCrawler(delay int, cookies map[string]string) *colly.Collector {
c := colly.NewCollector(colly.Async(true), colly.UserAgent(USERAGENT))
if delay > 0 {
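createCrawler is truncated right after the delay check. A sketch of how such a collector is commonly configured with colly follows; the domain glob, the delay unit (seconds, which assumes a time import), and attaching the saved cookies through OnRequest are illustrative assumptions, since the rest of the function is collapsed in this diff.

// Sketch of a rate-limited, cookie-carrying collector (details assumed):
func createCrawler(delay int, cookies map[string]string) *colly.Collector {
	c := colly.NewCollector(colly.Async(true), colly.UserAgent(USERAGENT))
	if delay > 0 {
		// Throttle requests so RYM does not ban the client's IP.
		c.Limit(&colly.LimitRule{
			DomainGlob:  "*rateyourmusic.com*",
			Delay:       time.Duration(delay) * time.Second, // unit assumed
			Parallelism: 1,
		})
	}
	// Send the stored session cookies (for instance those captured by Login) with every request.
	c.OnRequest(func(r *colly.Request) {
		pairs := make([]string, 0, len(cookies))
		for name, value := range cookies {
			pairs = append(pairs, name+"="+value)
		}
		if len(pairs) > 0 {
			r.Headers.Set("Cookie", strings.Join(pairs, "; "))
		}
	})
	return c
}

colly also offers c.SetCookies for the same purpose; which of the two the real function uses is not visible here.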
@@ -101,6 +102,7 @@ func extractAlbumData(h *colly.HTMLElement, query string, section string, rows *
})
}

// https://rateyourmusic.com/search?searchterm=velvet%20underground&searchtype=a
func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) {
data.Links = make([]string, 0)
c := createCrawler(r.Delay, r.Cookies)
@@ -122,6 +124,11 @@ func (r *RateYourMusic) SearchBand(data *ScrapedData) ([]int, []string) {
return rBandColWidths[:], rBandColTitles[:]
}

// In both cases we scrape tables containing album data; however, the jQuery-style selectors used to
// retrieve the data change.
// AlbumTable extracts album data from a particular section (Album, EP, Credits, ...).
// tableQuery defines the wrapper (also a table) around all the tables of a page.
// hasBio decides whether we look for the artist's bio (which exists only on the main page).
var credits albumQuery = albumQuery{
albumTables: []AlbumTable{
{"Credits", "div.disco_search_results > div.disco_release", 'c', make([][]string, 0), make([]string, 0)},
@@ -142,14 +149,9 @@ var mainPage albumQuery = albumQuery{
hasBio: true,
}

var typeToSection map[rune]string = map[rune]string{
's': "Album",
'l': "Live Album",
'e': "EP",
'a': "Appears On",
'c': "Compilation",
}

// https://rateyourmusic.com/artist/the-velvet-underground
// https://rateyourmusic.com/artist/the-velvet-underground/credits/
// r.Expand does what pressing the "show all" button does; it is applied to every section of the main artist page.
func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) {
var query albumQuery
var visitLink string
@@ -174,10 +176,10 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) {
data.Metadata.Set("Biography", h.Text)
})
}

if !r.Expand || r.GetCredits {
c.OnHTML(query.tableQuery, func(h *colly.HTMLElement) {
for _, albumTable := range query.albumTables {
for i := range query.albumTables {
albumTable := &query.albumTables[i]
extractAlbumData(h, albumTable.query, albumTable.section, &albumTable.rows, &albumTable.links)
}
})
@@ -199,6 +201,8 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) {
h.Request.PostMultipart("https://rateyourmusic.com/httprequest/ExpandDiscographySection", expandForm)
}
})
// The body of the response is a JavaScript function call returning an object that contains the expanded HTML.
// We parse the call, take the substring that contains only the HTML, then parse that HTML and extract the data.
c.OnResponse(func(r *colly.Response) {
if r.Headers.Get("content-type") == "application/javascript; charset=utf-8" {
body := string(r.Body)
@@ -207,7 +211,7 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) {
if err != nil {
fmt.Println("Error on response")
}
albumType := body[strings.Index(body, "('")+2] // string after javascript function name
albumType := body[strings.Index(body, "('")+2] // char after javascript function header
albumTableIndex := slices.IndexFunc(query.albumTables, func(table AlbumTable) bool {
return table.t == rune(albumType)
})
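To make the comment about the JavaScript response concrete, here is a tiny, self-contained illustration of the slicing technique on a made-up payload. The function name, the payload shape and the first-'<' / last-'>' markers are invented for the example; the real ExpandDiscographySection response format is not visible in this diff.

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	// Hypothetical payload, for illustration only.
	body := `discoExpand('s', "<div class='disco_release'>Loaded</div>");`

	// The character right after "('" identifies the expanded section
	// ('s' for albums, 'e' for EPs, ...), exactly as AlbumList reads it above.
	albumType := rune(body[strings.Index(body, "('")+2])

	// Keep only the substring that contains the HTML, then parse it with goquery.
	start, end := strings.Index(body, "<"), strings.LastIndex(body, ">")
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(body[start : end+1]))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(albumType), doc.Find("div.disco_release").Text()) // prints: s Loaded
}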
@@ -232,6 +236,7 @@ func (r *RateYourMusic) AlbumList(data *ScrapedData) ([]int, []string) {
return rAlbumlistColWidths[:], rAlbumlistColTitles[:]
}

// https://rateyourmusic.com/release/album/the-velvet-underground-nico/the-velvet-underground-and-nico/
func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) {
c := createCrawler(r.Delay, r.Cookies)
data.Metadata = orderedmap.New[string, string]()
@@ -240,7 +245,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) {
image_url := h.ChildAttr("img", "src")
h.Request.Visit("https:" + image_url)
})

c.OnResponse(func(r *colly.Response) {
if r.Headers.Get("content-type") == "image/jpg" || r.Headers.Get("content-type") == "image/png" {
var err error
@@ -250,7 +254,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) {
}
}
})

c.OnHTML("table.album_info > tbody > tr", func(h *colly.HTMLElement) {
key := h.ChildText("th")
value := strings.Join(strings.Fields(strings.ReplaceAll(h.ChildText("td"), "\n", "")), " ")
@@ -262,7 +265,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) {
albumId := h.Attr("value")
data.Metadata.Set("ID", albumId[6:len(albumId)-1])
})

c.OnHTML("div#column_container_left div.section_tracklisting ul#tracks", func(h *colly.HTMLElement) {
h.ForEach("li.track", func(_ int, h *colly.HTMLElement) {
if len(h.ChildText("span.tracklist_total")) > 0 {
@@ -276,7 +278,6 @@ func (r *RateYourMusic) Album(data *ScrapedData) ([]int, []string) {
}
})
})

c.Visit(r.Link)
c.Wait()
return rAlbumColWidths[:], rAlbumColTitles[:]
@@ -290,21 +291,21 @@ func (r *RateYourMusic) SetLink(link string) {
r.Link = link
}

// https://rateyourmusic.com/release/album/the-velvet-underground-nico/the-velvet-underground-and-nico/reviews/1/
// Recursively scrape all reviews (this may cause problems for very popular albums)
func (r *RateYourMusic) ReviewsList(data *ScrapedData) ([]int, []string) {
c := createCrawler(r.Delay, r.Cookies)
data.Links = make([]string, 0)

c.OnHTML("span.navspan a.navlinknext", func(h *colly.HTMLElement) {
h.Request.Visit(h.Attr("href"))
})

c.OnHTML("div.review > div.review_header ", func(h *colly.HTMLElement) {
user := h.ChildText("span.review_user")
date := h.ChildText("span.review_date")
rating := strings.Split(h.ChildAttr("span.review_rating > img", "alt"), " ")[0]
data.Rows = append(data.Rows, []string{user, date, rating})
})

c.OnHTML("div.review > div.review_body ", func(h *colly.HTMLElement) {
data.Links = append(data.Links, h.ChildText("span.rendered_text"))
})
Expand Down Expand Up @@ -346,6 +347,7 @@ var loginForm = map[string][]byte{
"action": []byte("Login"),
}

// For reference, inspect https://rateyourmusic.com/account/login
func (r *RateYourMusic) Login() {
user, password, err := credentials()
if err != nil {
@@ -360,15 +362,13 @@ func (r *RateYourMusic) Login() {
c.OnError(func(_ *colly.Response, err error) {
fmt.Println("Something went wrong:", err)
})

c.OnResponse(func(response *colly.Response) {
cookies := response.Headers.Values("Set-Cookie")
for _, cookieStr := range cookies {
cookie := strings.Split(strings.Split(cookieStr, "; ")[0], "=")
r.Cookies[cookie[0]] = cookie[1]
}
})

c.PostMultipart(LOGIN, loginForm)
c.Wait()
}
@@ -391,7 +391,6 @@ func (r *RateYourMusic) sendRating(rating string, id string) {
c.OnResponse(func(r *colly.Response) {
fmt.Println(r.StatusCode, "Vote has been uploaded.")
})

c.OnError(func(_ *colly.Response, err error) {
fmt.Println("Something went wrong:", err)
})
