-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
155 lines (133 loc) · 5.03 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package main
import (
"bufio"
"fmt"
"log"
"os"
"sitemapExport/crawler"
"sitemapExport/feed"
"sitemapExport/formatter"
"sitemapExport/writer"
"strings"
"github.com/spf13/cobra"
)
var (
feedURL string
cssSelector string
outputFilename string
outputFiletype string
format string
)
func main() {
if err := rootCmd.Execute(); err != nil {
handleError("executing command", err)
}
}
var rootCmd = &cobra.Command{
Use: "sitemapExport",
Short: "Crawl a sitemap or RSS feed and extract content.",
Run: executeCrawlAndExport, // Main function to run the command
}
func init() {
// Define flags in the init function
rootCmd.Flags().StringVarP(&feedURL, "url", "u", "", "Sitemap or RSS feed URL to crawl (required)")
rootCmd.Flags().StringVarP(&cssSelector, "css", "c", "body", "CSS selector to extract content (for sitemaps)")
rootCmd.Flags().StringVarP(&outputFilename, "filename", "n", "output", "Filename for the output")
rootCmd.Flags().StringVarP(&outputFiletype, "type", "t", "txt", "File output format (txt, json, jsonl, md, pdf)")
rootCmd.Flags().StringVarP(&format, "format", "f", "txt", "Content format transformation (html, md, txt)")
}
// executeCrawlAndExport prompts the user for missing input (if flags are not provided), validates the inputs, and runs the main export logic.
func executeCrawlAndExport(cmd *cobra.Command, args []string) {
// Prompt for missing user input
feedURL = promptUser("Enter the Sitemap or RSS feed URL (required): ", feedURL)
if feedURL == "" {
handleError("getting feed URL", fmt.Errorf("feed URL is required"))
}
cssSelector = promptUser("Enter the CSS selector to extract content (default: 'body'): ", cssSelector)
outputFilename = promptUser("Enter the output filename (default: 'output'): ", outputFilename)
// Validate output file type
outputFiletype = promptUser("Enter the output file type (txt, json, jsonl, md, pdf) (default: 'txt'): ", outputFiletype)
if !isValidOutputType(outputFiletype) {
handleError("validating output file type", fmt.Errorf("unsupported output file type: %s", outputFiletype))
}
// Validate content format
format = promptUser("Enter the content format (html, md, txt) (default: 'txt'): ", format)
if !isValidFormat(format) {
handleError("validating content format", fmt.Errorf("unsupported content format: %s", format))
}
// Confirm the input values with the user before proceeding
fmt.Printf("\nExport data with the following settings:\n")
fmt.Printf("URL: %s\n", feedURL)
fmt.Printf("CSS Selector: %s\n", cssSelector)
fmt.Printf("Output Filename: %s\n", outputFilename)
fmt.Printf("Output Filetype: %s\n", outputFiletype)
fmt.Printf("Format: %s\n", format)
confirmation := promptUser("Do you want to proceed with these settings? (y/n): ", "y")
if strings.ToLower(confirmation) != "y" {
fmt.Println("Operation cancelled.")
return
}
fmt.Print("\n")
// Step 1: Detect if it's an RSS feed or a Sitemap
feedType, err := feed.DetectFeedType(feedURL)
handleError("detecting feed type", err)
// Step 2: Fetch and crawl the pages based on the feed type
var pages []crawler.Page
switch feedType {
case "rss":
// Crawl RSS feed
pages, err = crawler.CrawlRSS(feedURL, cssSelector, format)
handleError("crawling RSS feed", err)
case "sitemap":
// Crawl Sitemap
pages, err = crawler.CrawlSitemap(feedURL, cssSelector, format)
handleError("crawling sitemap", err)
default:
handleError("processing feed", fmt.Errorf("unknown feed type detected"))
}
// Step 3: Format the extracted pages into the desired output file format
formattedContent, err := formatter.FormatPages(pages, outputFiletype)
handleError("formatting pages", err)
// Step 4: Write the formatted content to the specified output file
err = writer.WriteToFile(outputFilename, formattedContent, outputFiletype)
handleError("writing to file", err)
fmt.Printf("Successfully saved output to %s.%s\n", outputFilename, outputFiletype)
}
// promptUser is a helper function that asks for input, providing a default value if none is given.
func promptUser(message string, defaultValue string) string {
reader := bufio.NewReader(os.Stdin)
fmt.Print(message)
input, _ := reader.ReadString('\n')
input = strings.TrimSpace(input)
// If no input is provided, use the default value
if input == "" {
return defaultValue
}
return input
}
// handleError logs and terminates if an error occurs during a specific step.
func handleError(step string, err error) {
if err != nil {
log.Fatalf("Error %s: %v", step, err)
}
}
// isValidOutputType checks if the provided output filetype is supported.
func isValidOutputType(outputType string) bool {
supportedTypes := []string{"txt", "json", "jsonl", "md", "pdf"}
for _, t := range supportedTypes {
if strings.EqualFold(t, outputType) {
return true
}
}
return false
}
// isValidFormat checks if the provided content format transformation is supported.
func isValidFormat(format string) bool {
supportedFormats := []string{"html", "md", "txt"}
for _, f := range supportedFormats {
if strings.EqualFold(f, format) {
return true
}
}
return false
}