Skip to content

Commit

Permalink
template
Browse files Browse the repository at this point in the history
  • Loading branch information
kaki-admin committed Apr 19, 2024
1 parent c9484b5 commit ed5a4af
Show file tree
Hide file tree
Showing 54 changed files with 4,357 additions and 371 deletions.
149 changes: 119 additions & 30 deletions processor/rules.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package processor

import (
"fmt"
"net/url"
"strings"
)
Expand Down Expand Up @@ -43,19 +44,19 @@ var contentPredefinedRules = map[string]string{
"smbc-comics.com": "div#cc-comicbody, div#aftercomic",
"swordscomic.com": "img#comic-image, div#info-frame.tab-content-area",
"theoatmeal.com": "div#comic",
"theregister.com": "#top-col-story h2, #body",
"theverge.com": "h2.inline:nth-child(2),h2.duet--article--dangerously-set-cms-markup,figure.w-full,div.duet--article--article-body-component",
"turnoff.us": "article.post-content",
"universfreebox.com": "#corps_corps",
"version2.dk": "section.body",
"wdwnt.com": "div.entry-content",
"wired.com": "div.grid-layout__content",
"zeit.de": ".summary, .article-body",
"zdnet.com": "div.storyBody",
"pbfcomics": "div#comic",
"yahoo.com": "div.caas-body",
"kyivindependent.com": "div.c-content",
"news.mit.edu": "div.news-article--content--body--inner",
//"theregister.com": "#top-col-story h2, #body",
//"theverge.com": "h2.inline:nth-child(2),h2.duet--article--dangerously-set-cms-markup,figure.w-full,div.duet--article--article-body-component",
"turnoff.us": "article.post-content",
"universfreebox.com": "#corps_corps",
"version2.dk": "section.body",
"wdwnt.com": "div.entry-content",
"wired.com": "div.grid-layout__content",
"zeit.de": ".summary, .article-body",
"zdnet.com": "div.storyBody",
"pbfcomics": "div#comic",
"yahoo.com": "div.caas-body",
"kyivindependent.com": "div.c-content",
"news.mit.edu": "div.news-article--content--body--inner",
}

var contentPostExtractorTemplateRules = map[string]string{
Expand Down Expand Up @@ -87,9 +88,55 @@ var contentTemplatePredefinedRules = map[string]string{
"abc.net.au": "AbcNetAUScrapContent",
"yahoo.com": "YahoocrapContent",
"nbcnews.com": "NbcNewsScrapContent",
"cncf.io": "CNCFScrapContent",
"deepmind.google": "DeepMindScrapContent",
"digitaltrends.com": "DigitalTrendsScrapContent",
"nypost.com": "NYpostScrapContent",
"techcrunch.com": "TechCrunchScrapContent",
"theverge.com": "ThevergeScrapContent",

"theregister.com": "TheRegisterScrapContent",
"dazeddigital.com": "DazeddigitalScrapContent",
"deadline.com": "DeadlineScrapContent",
//"eonline.com": "EOnlineScrapContent",
"filmschoolrejects.com": "FilmSchoolRejectsScrapContent",
"independent.co.uk": "IndependentUKScrapContent",
"skysports.com": "SkySportsScrapContent",
"sbnation.com": "SbnationScrapContent",
"cbssports.com": "CBSsportsScrapContent",
"scmp.com": "SCMPScrapContent",
"cnn.com": "CNNScrapContent",
"businesslive.co.za": "BusinessLiveScrapContent",
"smallbiztrends.com": "SmallBizTrendsScrapContent",
"hbr.org": "HBRScrapContent",
"entrepreneur.com": "EntrepreneurScrapContent",
"businessinsider.com": "BusinessInsiderScrapContent",
"mattturck.com": "MattturckScrapContent",
"cfainstitute.org": "CFainstituteScrapContent",
"hunterwalk.com": "HunterWalkScrapContent",
"thisisgoingtobebig.com": "ThisisGoingtobeBIGScrapContent",
"ign.com": "IGNScrapContent",
"screenrant.com": "ScreenrantScrapContent",
"vice.com": "ViceScrapContent",
"variety.com": "VarietyScrapContent",
"avclub.com": "AVClubScrapContent",
"stereogum.com": "StereogumScrapContent",
"pitchfork.com": "PitchForkScrapContent",
"geektyrant.com": "GeektyrantScrapContent",
"advanced-television.com": "AdTelevisionScrapContent",
"bleacherreport.com": "BleadherReportScrapContent",

/*"espn.com": "EspnScrapContent",
"foxsports.com": "FoxSportsScrapContent",
"nbcsports.com": "NBCSportsScrapContent",
"deadspin.com": "DeadspinScrapContent",
"skynews.com": "SkyNewsScrapContent",
*/
}

var metadataTemplatePredefinedRules = map[string]string{
"eonline.com": "EonlineScrapMetaData",
"slashfilm.com": "SlashfilmScrapMetaData",
"abcnews.go.com": "AbcNewsScrapMetaData",
"apnews.com": "ApnNewsScrapMetaData",
"www.aljazeera.com": "AljazeeraScrapMetaData",
Expand All @@ -108,22 +155,41 @@ var metadataTemplatePredefinedRules = map[string]string{
"themoscowtimes.com": "ThemoscowtimesScrapMetaData",
"themessenger.com": "ThemessengerScrapMetaData",
"theguardian.com": "TheguardianScrapMetaData",
"bbc.com": "BBCScrapMetaData",
".bbc.co.": "BBCScrapMetaData",
"time.com": "TimesScrapMetaData",
// "bbc.com": "BBCScrapMetaData",
// ".bbc.co.": "BBCScrapMetaData",
"www.bbc.co.uk/news": "BBCNewsScrapMetaData",
"time.com": "TimesScrapMetaData",
"npr.org": "NprScrapMetaData",
"stereogum.com": "StereogumScrapMetaData",
"www.vice.com": "ViceScrapMetaData",
"a16z.com": "A16ZScrapMetaData",
"a16zcrypto.com": "A16ZCrptoScrapMetaData",
"businessinsider.com": "BusinessinsiderScrapMetaData",
"foxbusiness.com": "FoxbusinessScrapMetaData",
"businesslive.co.za": "BusinessliveScrapMetaData",
"edition.cnn.com": "EditionCnnScrapMetaData",
"money.cnn.com": "EditionCnnScrapMetaData",
"skysports.com": "SkySportsScrapMetaData",
"www.bbc.com/sport": "BBCSportsScrapMetaData",
"www.bbc.co.uk/sport": "BBCSportsScrapMetaData",
"cbssports.com": "CBSSportsScrapMetaData",
".espn.com": "ESPNScrapMetaData",
"foxsports.com": "FoxsportsScrapMetaData",
"hbr.org": "HBRScrapMetaData",
}

var publishedAtTimeStampTemplatePredefinedRules = map[string]string {
var publishedAtTimeStampTemplatePredefinedRules = map[string]string{
"slashfilm.com": "SlashfilmNewsPublishedAtTimeFromScriptMetadata",
"abcnews.go.com": "CommonGetPublishedAtTimestampSingleJson",
"apnews.com": "CommonGetPublishedAtTimestampSingleJson",
"apnews.com": "ApNewsCommonGetPublishedAtTimestamp",
"www.aljazeera.com": "CommonGetPublishedAtTimestampSingleJson",
"news.sky.com": "CommonGetPublishedAtTimestampSingleJson",
"news.sky.com": "SkyNewsPublishedAtTimeFromScriptMetadata",
"yahoo.com": "CommonGetPublishedAtTimestampSingleJson",
"abc.net.au": "CommonGetPublishedAtTimestampSingleJson",
"cbsnews.com": "CbsnewsWorldGetPublishedAtTimestampSingleJson",
"cnbc.com": "CnbcWorldGetPublishedAtTimestampSingleJson",
"cnbc.com": "CnbcPublishedAtTimeFromScriptMetadata",
"dw.com": "CommonGetPublishedAtTimestampSingleJson",
"euronews.com": "EuroNewsGetPublishedAtTimeStamp",
"euronews.com": "EuroNewsGetPublishedAtTimeStampStruct",
"foxnews.com": "CommonGetPublishedAtTimestampSingleJson",
"huffpost.com": "CommonGetPublishedAtTimestampSingleJson",
"nbcnews.com": "CommonGetPublishedAtTimestampSingleJson",
Expand All @@ -132,16 +198,36 @@ var publishedAtTimeStampTemplatePredefinedRules = map[string]string {
"themoscowtimes.com": "CommonGetPublishedAtTimestampSingleJson",
"themessenger.com": "TheMessengerGetPublishedAtTimestampSingleJson",
"theguardian.com": "CommonGetPublishedAtTimestampMultipleJson",
"bbc.com": "CommonGetPublishedAtTimestampSingleJson",
".bbc.co.": "CommonGetPublishedAtTimestampSingleJson",
"time.com": "CommonGetPublishedAtTimestampMultipleJson",

// "bbc.com": "CommonGetPublishedAtTimestampSingleJson",
// ".bbc.co.": "CommonGetPublishedAtTimestampSingleJson",
"www.bbc.co.uk/news": "BBCNewsPublishedAtTimeFromScriptMetadata",
"time.com": "CommonGetPublishedAtTimestampMultipleJson",
"eonline.com": "EonlinePublishedAtTimeFromScriptMetadata",
"npr.org": "NprPublishedAtTimeFromScriptMetadata",
"stereogum.com": "StereogumPublishedAtTimeFromScriptMetadata",
"www.vice.com": "VicePublishedAtTimeFromScriptMetadata",
"a16z.com": "A16ZPublishedAtTimeFromScriptMetadata",
"businessinsider.com": "BusinessinsiderPublishedAtTimeFromScriptMetadata",
"foxbusiness.com": "FoxbusinessPublishedAtTimeFromScriptMetadata",
"businesslive.co.za": "BusinesslivePublishedAtTimeFromScriptMetadata",
"edition.cnn.com": "EditionCnnPublishedAtTimeFromScriptMetadata",
"money.cnn.com": "EditionCnnPublishedAtTimeFromScriptMetadata",
"skysports.com": "SkySportsPublishedAtTimeFromScriptMetadata",
"www.bbc.com/sport": "BBCSportsPublishedAtTimeFromScriptMetadata",
"www.bbc.co.uk/sport": "BBCSportsPublishedAtTimeFromScriptMetadata",
"cbssports.com": "CBSSportPublishedAtTimeFromScriptMetadata",
"espn.com": "ESPNPublishedAtTimeFromScriptMetadata",
"foxsports.com": "FoxsportsPublishedAtTimeFromScriptMetadata",
"hbr.org": "HBRPublishedAtTimeFromScriptMetadata",
}


// getPredefinedPublishedAtTimestampTemplateRules looks up the published-at
// timestamp template registered for websiteURL in
// publishedAtTimeStampTemplatePredefinedRules.
//
// Matching is by substring and runs in two passes. The first pass tests every
// map key against the full websiteURL, which is what lets keys that carry a
// path component (for example "www.bbc.co.uk/news") match. The second pass
// tests every key against the value returned by domain(websiteURL). The
// function returns the matching key together with its template name, or two
// empty strings when no key matches.
//
// NOTE(review): Go does not define map iteration order, so when more than one
// key is a substring of the same URL the entry that is returned can differ
// from call to call.
func getPredefinedPublishedAtTimestampTemplateRules(websiteURL string) (string, string) {
// domain is a helper defined outside this view; presumably it extracts the
// host part of the URL — TODO confirm.
urlDomain := domain(websiteURL)

// Pass 1: match keys against the whole URL (host and path). The loop variable
// named domain shadows the domain helper inside the loop body.
for domain, rules := range publishedAtTimeStampTemplatePredefinedRules {
if strings.Contains(websiteURL, domain) {
return domain, rules
}
}
// Pass 2: fall back to matching keys against the extracted domain only.
for domain, rules := range publishedAtTimeStampTemplatePredefinedRules {
if strings.Contains(urlDomain, domain) {
return domain, rules
Expand All @@ -150,7 +236,6 @@ func getPredefinedPublishedAtTimestampTemplateRules(websiteURL string) (string,
// No key matched in either pass.
return "", ""
}


func getContentPostExtractorTemplateRules(websiteURL string) string {
urlDomain := domain(websiteURL)
for url, rules := range contentPostExtractorTemplateRules {
Expand All @@ -174,7 +259,6 @@ func getPredefinedScraperRules(websiteURL string) (string, string) {

func getPredefinedContentTemplateRules(websiteURL string) (string, string) {
urlDomain := domain(websiteURL)

for domain, rules := range contentTemplatePredefinedRules {
if strings.Contains(urlDomain, domain) {
return domain, rules
Expand All @@ -185,7 +269,12 @@ func getPredefinedContentTemplateRules(websiteURL string) (string, string) {

func getPredefinedMetaDataTemplateRules(websiteURL string) (string, string) {
urlDomain := domain(websiteURL)

fmt.Printf("+++++++++++++++++++++ websiteURL %s\n",websiteURL)
for domain, rules := range metadataTemplatePredefinedRules {
if strings.Contains(websiteURL, domain) {
return domain, rules
}
}
for domain, rules := range metadataTemplatePredefinedRules {
if strings.Contains(urlDomain, domain) {
return domain, rules
Expand Down
Loading

0 comments on commit ed5a4af

Please sign in to comment.