-
Notifications
You must be signed in to change notification settings - Fork 0
feat: Reddit thread/subreddit archival #84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| package cmd | ||
|
|
||
| import ( | ||
| "github.com/spf13/cobra" | ||
| ) | ||
|
|
||
| // collectRedditCmd represents the collect reddit command | ||
| var collectRedditCmd = NewCollectRedditCmd() | ||
|
|
||
| func init() { | ||
| GetCollectCmd().AddCommand(GetCollectRedditCmd()) | ||
| } | ||
|
|
||
| func NewCollectRedditCmd() *cobra.Command { | ||
| return &cobra.Command{ | ||
| Use: "reddit", | ||
| Short: "Collect a resource from Reddit.", | ||
| Long: `Collect a resource from Reddit and store it in a DataNode.`, | ||
| } | ||
| } | ||
|
|
||
| func GetCollectRedditCmd() *cobra.Command { | ||
| return collectRedditCmd | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| package cmd | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "strings" | ||
|
|
||
| "github.com/Snider/Borg/pkg/compress" | ||
| "github.com/Snider/Borg/pkg/datanode" | ||
| "github.com/Snider/Borg/pkg/reddit" | ||
| "github.com/Snider/Borg/pkg/tim" | ||
| "github.com/Snider/Borg/pkg/trix" | ||
| "github.com/spf13/cobra" | ||
| ) | ||
|
|
||
| // collectRedditSubredditCmd represents the collect reddit subreddit command | ||
| var collectRedditSubredditCmd = NewCollectRedditSubredditCmd() | ||
|
|
||
| func init() { | ||
| GetCollectRedditCmd().AddCommand(GetCollectRedditSubredditCmd()) | ||
| } | ||
|
|
||
| func GetCollectRedditSubredditCmd() *cobra.Command { | ||
| return collectRedditSubredditCmd | ||
| } | ||
|
|
||
| func NewCollectRedditSubredditCmd() *cobra.Command { | ||
| collectRedditSubredditCmd := &cobra.Command{ | ||
| Use: "subreddit [name]", | ||
| Short: "Collect a subreddit's top posts", | ||
| Long: `Collect a subreddit's top posts and store them in a DataNode.`, | ||
| Args: cobra.ExactArgs(1), | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| subredditName := args[0] | ||
| outputFile, _ := cmd.Flags().GetString("output") | ||
| limit, _ := cmd.Flags().GetInt("limit") | ||
| sort, _ := cmd.Flags().GetString("sort") | ||
| format, _ := cmd.Flags().GetString("format") | ||
| compression, _ := cmd.Flags().GetString("compression") | ||
| password, _ := cmd.Flags().GetString("password") | ||
|
|
||
| if format != "datanode" && format != "tim" && format != "trix" { | ||
| return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format) | ||
| } | ||
|
Comment on lines
+42
to
+44
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The allowed format types ( Example: // At package level
const (
formatDataNode = "datanode"
formatTIM = "tim"
formatTRIX = "trix"
)
// In RunE
if format != formatDataNode && format != formatTIM && format != formatTRIX {
return fmt.Errorf("invalid format: %s (must be '%s', '%s', or '%s')", format, formatDataNode, formatTIM, formatTRIX)
} |
||
|
|
||
| threads, err := reddit.ScrapeSubreddit(subredditName, sort, limit) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to scrape subreddit: %w", err) | ||
| } | ||
|
|
||
| dn := datanode.New() | ||
| for _, threadStub := range threads { | ||
| thread, err := reddit.ScrapeThread(threadStub.URL) | ||
| if err != nil { | ||
| // It's better to log the error and continue | ||
| fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err) | ||
| continue | ||
| } | ||
|
|
||
| var builder strings.Builder | ||
| builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title)) | ||
| builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post)) | ||
| for _, comment := range thread.Comments { | ||
| builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author)) | ||
| builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body)) | ||
| } | ||
| // Sanitize filename | ||
| filename := strings.ReplaceAll(thread.Title, " ", "_") | ||
| filename = strings.ReplaceAll(filename, "/", "_") | ||
|
Comment on lines
+68
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current filename sanitization is basic and only replaces spaces and forward slashes. Reddit titles can contain various other characters that are invalid in filenames on different operating systems (e.g., Consider using a regular expression to replace any character that is not a letter, number, underscore, dot, or hyphen with an underscore. Example: import "regexp"
// ...
var re = regexp.MustCompile(`[^\w.-]`)
filename := re.ReplaceAllString(thread.Title, "_") |
||
| err = dn.AddData(fmt.Sprintf("r-%s/posts/%s.md", subredditName, filename), []byte(builder.String())) | ||
|
Check failure on line 70 in cmd/collect_reddit_subreddit.go
|
||
| if err != nil { | ||
| return fmt.Errorf("error adding data to DataNode: %w", err) | ||
| } | ||
| } | ||
|
|
||
| var data []byte | ||
| if format == "tim" { | ||
| tim, err := tim.FromDataNode(dn) | ||
| if err != nil { | ||
| return fmt.Errorf("error creating tim: %w", err) | ||
| } | ||
| data, err = tim.ToTar() | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing tim: %w", err) | ||
| } | ||
| } else if format == "trix" { | ||
| data, err = trix.ToTrix(dn, password) | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing trix: %w", err) | ||
| } | ||
| } else { | ||
| data, err = dn.ToTar() | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing DataNode: %w", err) | ||
| } | ||
| } | ||
|
|
||
| compressedData, err := compress.Compress(data, compression) | ||
| if err != nil { | ||
| return fmt.Errorf("error compressing data: %w", err) | ||
| } | ||
|
|
||
| if outputFile == "" { | ||
| outputFile = "subreddit." + format | ||
| if compression != "none" { | ||
| outputFile += "." + compression | ||
| } | ||
| } | ||
|
|
||
| err = os.WriteFile(outputFile, compressedData, 0644) | ||
| if err != nil { | ||
| return fmt.Errorf("error writing subreddit to file: %w", err) | ||
| } | ||
|
|
||
| fmt.Fprintln(cmd.OutOrStdout(), "Subreddit saved to", outputFile) | ||
| return nil | ||
| }, | ||
|
Comment on lines
+33
to
+117
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic within this Additionally, the markdown generation logic is duplicated across |
||
| } | ||
| collectRedditSubredditCmd.PersistentFlags().String("output", "", "Output file for the DataNode") | ||
| collectRedditSubredditCmd.PersistentFlags().Int("limit", 100, "Number of posts to collect") | ||
| collectRedditSubredditCmd.PersistentFlags().String("sort", "top", "Sort order for posts (top, new)") | ||
| collectRedditSubredditCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)") | ||
| collectRedditSubredditCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") | ||
| collectRedditSubredditCmd.PersistentFlags().String("password", "", "Password for encryption") | ||
| return collectRedditSubredditCmd | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| package cmd |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| package cmd | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "strings" | ||
|
|
||
| "github.com/Snider/Borg/pkg/compress" | ||
| "github.com/Snider/Borg/pkg/datanode" | ||
| "github.com/Snider/Borg/pkg/reddit" | ||
| "github.com/Snider/Borg/pkg/tim" | ||
| "github.com/Snider/Borg/pkg/trix" | ||
| "github.com/spf13/cobra" | ||
| ) | ||
|
|
||
| // collectRedditThreadCmd represents the collect reddit thread command | ||
| var collectRedditThreadCmd = NewCollectRedditThreadCmd() | ||
|
|
||
| func init() { | ||
| GetCollectRedditCmd().AddCommand(GetCollectRedditThreadCmd()) | ||
| } | ||
|
|
||
| func GetCollectRedditThreadCmd() *cobra.Command { | ||
| return collectRedditThreadCmd | ||
| } | ||
|
|
||
| func NewCollectRedditThreadCmd() *cobra.Command { | ||
| collectRedditThreadCmd := &cobra.Command{ | ||
| Use: "thread [url]", | ||
| Short: "Collect a single Reddit thread", | ||
| Long: `Collect a single Reddit thread and store it in a DataNode.`, | ||
| Args: cobra.ExactArgs(1), | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| threadURL := args[0] | ||
| outputFile, _ := cmd.Flags().GetString("output") | ||
| format, _ := cmd.Flags().GetString("format") | ||
| compression, _ := cmd.Flags().GetString("compression") | ||
| password, _ := cmd.Flags().GetString("password") | ||
|
|
||
| if format != "datanode" && format != "tim" && format != "trix" { | ||
| return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format) | ||
| } | ||
|
|
||
| thread, err := reddit.ScrapeThread(threadURL) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to scrape thread: %w", err) | ||
| } | ||
|
|
||
| // Convert thread to Markdown | ||
| var builder strings.Builder | ||
| builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title)) | ||
| builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post)) | ||
| for _, comment := range thread.Comments { | ||
| builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author)) | ||
| builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body)) | ||
| } | ||
|
|
||
| dn := datanode.New() | ||
| err = dn.AddData("thread.md", []byte(builder.String())) | ||
|
Check failure on line 59 in cmd/collect_reddit_thread.go
|
||
| if err != nil { | ||
| return fmt.Errorf("error adding data to DataNode: %w", err) | ||
| } | ||
|
|
||
| var data []byte | ||
| if format == "tim" { | ||
| tim, err := tim.FromDataNode(dn) | ||
| if err != nil { | ||
| return fmt.Errorf("error creating tim: %w", err) | ||
| } | ||
| data, err = tim.ToTar() | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing tim: %w", err) | ||
| } | ||
| } else if format == "trix" { | ||
| data, err = trix.ToTrix(dn, password) | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing trix: %w", err) | ||
| } | ||
| } else { | ||
| data, err = dn.ToTar() | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing DataNode: %w", err) | ||
| } | ||
| } | ||
|
|
||
| compressedData, err := compress.Compress(data, compression) | ||
| if err != nil { | ||
| return fmt.Errorf("error compressing data: %w", err) | ||
| } | ||
|
|
||
| if outputFile == "" { | ||
| outputFile = "thread." + format | ||
| if compression != "none" { | ||
| outputFile += "." + compression | ||
| } | ||
| } | ||
|
|
||
| err = os.WriteFile(outputFile, compressedData, 0644) | ||
| if err != nil { | ||
| return fmt.Errorf("error writing thread to file: %w", err) | ||
| } | ||
|
|
||
| fmt.Fprintln(cmd.OutOrStdout(), "Thread saved to", outputFile) | ||
| return nil | ||
| }, | ||
| } | ||
| collectRedditThreadCmd.PersistentFlags().String("output", "", "Output file for the DataNode") | ||
| collectRedditThreadCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)") | ||
| collectRedditThreadCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") | ||
| collectRedditThreadCmd.PersistentFlags().String("password", "", "Password for encryption") | ||
| return collectRedditThreadCmd | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| package cmd | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "strings" | ||
|
|
||
| "github.com/Snider/Borg/pkg/compress" | ||
| "github.com/Snider/Borg/pkg/datanode" | ||
| "github.com/Snider/Borg/pkg/reddit" | ||
| "github.com/Snider/Borg/pkg/tim" | ||
| "github.com/Snider/Borg/pkg/trix" | ||
| "github.com/spf13/cobra" | ||
| ) | ||
|
|
||
| // collectRedditUserCmd represents the collect reddit user command | ||
| var collectRedditUserCmd = NewCollectRedditUserCmd() | ||
|
|
||
| func init() { | ||
| GetCollectRedditCmd().AddCommand(GetCollectRedditUserCmd()) | ||
| } | ||
|
|
||
| func GetCollectRedditUserCmd() *cobra.Command { | ||
| return collectRedditUserCmd | ||
| } | ||
|
|
||
| func NewCollectRedditUserCmd() *cobra.Command { | ||
| collectRedditUserCmd := &cobra.Command{ | ||
| Use: "user [name]", | ||
| Short: "Collect a user's posts", | ||
| Long: `Collect a user's posts and store them in a DataNode.`, | ||
| Args: cobra.ExactArgs(1), | ||
| RunE: func(cmd *cobra.Command, args []string) error { | ||
| userName := args[0] | ||
| outputFile, _ := cmd.Flags().GetString("output") | ||
| format, _ := cmd.Flags().GetString("format") | ||
| compression, _ := cmd.Flags().GetString("compression") | ||
| password, _ := cmd.Flags().GetString("password") | ||
|
|
||
| if format != "datanode" && format != "tim" && format != "trix" { | ||
| return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format) | ||
| } | ||
|
|
||
| threads, err := reddit.ScrapeUser(userName) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to scrape user: %w", err) | ||
| } | ||
|
|
||
| dn := datanode.New() | ||
| for _, threadStub := range threads { | ||
| thread, err := reddit.ScrapeThread(threadStub.URL) | ||
| if err != nil { | ||
| // It's better to log the error and continue | ||
| fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err) | ||
| continue | ||
| } | ||
|
|
||
| var builder strings.Builder | ||
| builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title)) | ||
| builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post)) | ||
| for _, comment := range thread.Comments { | ||
| builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author)) | ||
| builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body)) | ||
| } | ||
| // Sanitize filename | ||
| filename := strings.ReplaceAll(thread.Title, " ", "_") | ||
| filename = strings.ReplaceAll(filename, "/", "_") | ||
| err = dn.AddData(fmt.Sprintf("u-%s/posts/%s.md", userName, filename), []byte(builder.String())) | ||
|
Check failure on line 68 in cmd/collect_reddit_user.go
|
||
| if err != nil { | ||
| return fmt.Errorf("error adding data to DataNode: %w", err) | ||
| } | ||
| } | ||
|
|
||
| var data []byte | ||
| if format == "tim" { | ||
| tim, err := tim.FromDataNode(dn) | ||
| if err != nil { | ||
| return fmt.Errorf("error creating tim: %w", err) | ||
| } | ||
| data, err = tim.ToTar() | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing tim: %w", err) | ||
| } | ||
| } else if format == "trix" { | ||
| data, err = trix.ToTrix(dn, password) | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing trix: %w", err) | ||
| } | ||
| } else { | ||
| data, err = dn.ToTar() | ||
| if err != nil { | ||
| return fmt.Errorf("error serializing DataNode: %w", err) | ||
| } | ||
| } | ||
|
|
||
| compressedData, err := compress.Compress(data, compression) | ||
| if err != nil { | ||
| return fmt.Errorf("error compressing data: %w", err) | ||
| } | ||
|
|
||
| if outputFile == "" { | ||
| outputFile = "user." + format | ||
| if compression != "none" { | ||
| outputFile += "." + compression | ||
| } | ||
| } | ||
|
|
||
| err = os.WriteFile(outputFile, compressedData, 0644) | ||
| if err != nil { | ||
| return fmt.Errorf("error writing user to file: %w", err) | ||
| } | ||
|
|
||
| fmt.Fprintln(cmd.OutOrStdout(), "User posts saved to", outputFile) | ||
| return nil | ||
| }, | ||
| } | ||
| collectRedditUserCmd.PersistentFlags().String("output", "", "Output file for the DataNode") | ||
| collectRedditUserCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)") | ||
| collectRedditUserCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") | ||
| collectRedditUserCmd.PersistentFlags().String("password", "", "Password for encryption") | ||
| return collectRedditUserCmd | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Errors returned by the `cmd.Flags().Get...` functions are being ignored. This can lead to unexpected behavior if a flag is not found or has an incorrect type. This is a critical issue that can cause the program to fail silently or operate with incorrect default values. Please handle these errors properly. This issue is also present in `cmd/collect_reddit_thread.go` and `cmd/collect_reddit_user.go`.