Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cmd/collect_reddit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package cmd

import (
"github.com/spf13/cobra"
)

// collectRedditCmd is the package-level `collect reddit` command. It is built
// once, at package initialization, by NewCollectRedditCmd.
var collectRedditCmd = NewCollectRedditCmd()

// init registers the reddit command as a subcommand of the collect command.
func init() {
GetCollectCmd().AddCommand(GetCollectRedditCmd())
}

// NewCollectRedditCmd constructs a fresh `reddit` command. The command carries
// only usage and help text; it defines no run function of its own.
func NewCollectRedditCmd() *cobra.Command {
	redditCmd := new(cobra.Command)
	redditCmd.Use = "reddit"
	redditCmd.Short = "Collect a resource from Reddit."
	redditCmd.Long = `Collect a resource from Reddit and store it in a DataNode.`
	return redditCmd
}

// GetCollectRedditCmd returns the shared package-level reddit command, so
// that other files in the package can attach subcommands to it.
func GetCollectRedditCmd() *cobra.Command {
return collectRedditCmd
}
126 changes: 126 additions & 0 deletions cmd/collect_reddit_subreddit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package cmd

import (
"fmt"
"os"
"strings"

"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/datanode"
"github.com/Snider/Borg/pkg/reddit"
"github.com/Snider/Borg/pkg/tim"
"github.com/Snider/Borg/pkg/trix"
"github.com/spf13/cobra"
)

// collectRedditSubredditCmd is the package-level `collect reddit subreddit`
// command. It is built once, at package initialization, by
// NewCollectRedditSubredditCmd.
var collectRedditSubredditCmd = NewCollectRedditSubredditCmd()

// init registers the subreddit command as a subcommand of the reddit command.
func init() {
GetCollectRedditCmd().AddCommand(GetCollectRedditSubredditCmd())
}

// GetCollectRedditSubredditCmd returns the shared package-level subreddit
// command instance.
func GetCollectRedditSubredditCmd() *cobra.Command {
return collectRedditSubredditCmd
}

func NewCollectRedditSubredditCmd() *cobra.Command {
collectRedditSubredditCmd := &cobra.Command{
Use: "subreddit [name]",
Short: "Collect a subreddit's top posts",
Long: `Collect a subreddit's top posts and store them in a DataNode.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
subredditName := args[0]
outputFile, _ := cmd.Flags().GetString("output")
limit, _ := cmd.Flags().GetInt("limit")
sort, _ := cmd.Flags().GetString("sort")
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")
Comment on lines +35 to +40

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Errors returned by cmd.Flags().Get... functions are being ignored. This can lead to unexpected behavior if a flag is not found or has an incorrect type. This is a critical issue that can cause the program to fail silently or operate with incorrect default values. Please handle these errors properly. This issue is also present in cmd/collect_reddit_thread.go and cmd/collect_reddit_user.go.

outputFile, err := cmd.Flags().GetString("output")
			if err != nil {
				return fmt.Errorf("could not parse 'output' flag: %w", err)
			}
			limit, err := cmd.Flags().GetInt("limit")
			if err != nil {
				return fmt.Errorf("could not parse 'limit' flag: %w", err)
			}
			sort, err := cmd.Flags().GetString("sort")
			if err != nil {
				return fmt.Errorf("could not parse 'sort' flag: %w", err)
			}
			format, err := cmd.Flags().GetString("format")
			if err != nil {
				return fmt.Errorf("could not parse 'format' flag: %w", err)
			}
			compression, err := cmd.Flags().GetString("compression")
			if err != nil {
				return fmt.Errorf("could not parse 'compression' flag: %w", err)
			}
			password, err := cmd.Flags().GetString("password")
			if err != nil {
				return fmt.Errorf("could not parse 'password' flag: %w", err)
			}


if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
}
Comment on lines +42 to +44

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The allowed format types (datanode, tim, trix) are used as magic strings. It would be better to define these as constants at the package level to avoid typos and improve maintainability. You could also create a helper function or a map to validate the format more cleanly.

Example:

// At package level
const (
    formatDataNode = "datanode"
    formatTIM      = "tim"
    formatTRIX     = "trix"
)

// In RunE
if format != formatDataNode && format != formatTIM && format != formatTRIX {
    return fmt.Errorf("invalid format: %s (must be '%s', '%s', or '%s')", format, formatDataNode, formatTIM, formatTRIX)
}


threads, err := reddit.ScrapeSubreddit(subredditName, sort, limit)
if err != nil {
return fmt.Errorf("failed to scrape subreddit: %w", err)
}

dn := datanode.New()
for _, threadStub := range threads {
thread, err := reddit.ScrapeThread(threadStub.URL)
if err != nil {
// It's better to log the error and continue
fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err)
continue
}

var builder strings.Builder
builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title))
builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post))
for _, comment := range thread.Comments {
builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author))
builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body))
}
// Sanitize filename
filename := strings.ReplaceAll(thread.Title, " ", "_")
filename = strings.ReplaceAll(filename, "/", "_")
Comment on lines +68 to +69

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current filename sanitization is basic and only replaces spaces and forward slashes. Reddit titles can contain various other characters that are invalid in filenames on different operating systems (e.g., \, :, *, ?, ", <, >, |). This could cause errors when writing files.

Consider using a regular expression to replace any character that is not a letter, number, underscore, dot, or hyphen with an underscore.

Example:

import "regexp"

// ...

var re = regexp.MustCompile(`[^\w.-]`)
filename := re.ReplaceAllString(thread.Title, "_")

err = dn.AddData(fmt.Sprintf("r-%s/posts/%s.md", subredditName, filename), []byte(builder.String()))

Check failure on line 70 in cmd/collect_reddit_subreddit.go

View workflow job for this annotation

GitHub Actions / build

dn.AddData(fmt.Sprintf("r-%s/posts/%s.md", subredditName, filename), []byte(builder.String())) (no value) used as value

Check failure on line 70 in cmd/collect_reddit_subreddit.go

View workflow job for this annotation

GitHub Actions / build

dn.AddData(fmt.Sprintf("r-%s/posts/%s.md", subredditName, filename), []byte(builder.String())) (no value) used as value
if err != nil {
return fmt.Errorf("error adding data to DataNode: %w", err)
}
}

var data []byte
if format == "tim" {
tim, err := tim.FromDataNode(dn)
if err != nil {
return fmt.Errorf("error creating tim: %w", err)
}
data, err = tim.ToTar()
if err != nil {
return fmt.Errorf("error serializing tim: %w", err)
}
} else if format == "trix" {
data, err = trix.ToTrix(dn, password)
if err != nil {
return fmt.Errorf("error serializing trix: %w", err)
}
} else {
data, err = dn.ToTar()
if err != nil {
return fmt.Errorf("error serializing DataNode: %w", err)
}
}

compressedData, err := compress.Compress(data, compression)
if err != nil {
return fmt.Errorf("error compressing data: %w", err)
}

if outputFile == "" {
outputFile = "subreddit." + format
if compression != "none" {
outputFile += "." + compression
}
}

err = os.WriteFile(outputFile, compressedData, 0644)
if err != nil {
return fmt.Errorf("error writing subreddit to file: %w", err)
}

fmt.Fprintln(cmd.OutOrStdout(), "Subreddit saved to", outputFile)
return nil
},
Comment on lines +33 to +117

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The logic within this RunE function is nearly identical to the one in cmd/collect_reddit_user.go. This significant code duplication makes maintenance harder. Consider refactoring the common logic into a shared helper function. This function could take parameters like the scraper function (reddit.ScrapeSubreddit or reddit.ScrapeUser), the name of the entity being scraped (subreddit or user name), and the prefix for the datanode path (r- or u-).

Additionally, the markdown generation logic is duplicated across collect_reddit_subreddit.go, collect_reddit_thread.go, and collect_reddit_user.go. This could also be extracted into a helper function, for example, func threadToMarkdown(thread *reddit.Thread) []byte.

}
collectRedditSubredditCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
collectRedditSubredditCmd.PersistentFlags().Int("limit", 100, "Number of posts to collect")
collectRedditSubredditCmd.PersistentFlags().String("sort", "top", "Sort order for posts (top, new)")
collectRedditSubredditCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectRedditSubredditCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectRedditSubredditCmd.PersistentFlags().String("password", "", "Password for encryption")
return collectRedditSubredditCmd
}
1 change: 1 addition & 0 deletions cmd/collect_reddit_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package cmd
112 changes: 112 additions & 0 deletions cmd/collect_reddit_thread.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package cmd

import (
"fmt"
"os"
"strings"

"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/datanode"
"github.com/Snider/Borg/pkg/reddit"
"github.com/Snider/Borg/pkg/tim"
"github.com/Snider/Borg/pkg/trix"
"github.com/spf13/cobra"
)

// collectRedditThreadCmd is the package-level `collect reddit thread` command.
// It is built once, at package initialization, by NewCollectRedditThreadCmd.
var collectRedditThreadCmd = NewCollectRedditThreadCmd()

// init registers the thread command as a subcommand of the reddit command.
func init() {
GetCollectRedditCmd().AddCommand(GetCollectRedditThreadCmd())
}

// GetCollectRedditThreadCmd returns the shared package-level thread command
// instance.
func GetCollectRedditThreadCmd() *cobra.Command {
return collectRedditThreadCmd
}

func NewCollectRedditThreadCmd() *cobra.Command {
collectRedditThreadCmd := &cobra.Command{
Use: "thread [url]",
Short: "Collect a single Reddit thread",
Long: `Collect a single Reddit thread and store it in a DataNode.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
threadURL := args[0]
outputFile, _ := cmd.Flags().GetString("output")
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")

if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
}

thread, err := reddit.ScrapeThread(threadURL)
if err != nil {
return fmt.Errorf("failed to scrape thread: %w", err)
}

// Convert thread to Markdown
var builder strings.Builder
builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title))
builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post))
for _, comment := range thread.Comments {
builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author))
builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body))
}

dn := datanode.New()
err = dn.AddData("thread.md", []byte(builder.String()))

Check failure on line 59 in cmd/collect_reddit_thread.go

View workflow job for this annotation

GitHub Actions / build

dn.AddData("thread.md", []byte(builder.String())) (no value) used as value

Check failure on line 59 in cmd/collect_reddit_thread.go

View workflow job for this annotation

GitHub Actions / build

dn.AddData("thread.md", []byte(builder.String())) (no value) used as value
if err != nil {
return fmt.Errorf("error adding data to DataNode: %w", err)
}

var data []byte
if format == "tim" {
tim, err := tim.FromDataNode(dn)
if err != nil {
return fmt.Errorf("error creating tim: %w", err)
}
data, err = tim.ToTar()
if err != nil {
return fmt.Errorf("error serializing tim: %w", err)
}
} else if format == "trix" {
data, err = trix.ToTrix(dn, password)
if err != nil {
return fmt.Errorf("error serializing trix: %w", err)
}
} else {
data, err = dn.ToTar()
if err != nil {
return fmt.Errorf("error serializing DataNode: %w", err)
}
}

compressedData, err := compress.Compress(data, compression)
if err != nil {
return fmt.Errorf("error compressing data: %w", err)
}

if outputFile == "" {
outputFile = "thread." + format
if compression != "none" {
outputFile += "." + compression
}
}

err = os.WriteFile(outputFile, compressedData, 0644)
if err != nil {
return fmt.Errorf("error writing thread to file: %w", err)
}

fmt.Fprintln(cmd.OutOrStdout(), "Thread saved to", outputFile)
return nil
},
}
collectRedditThreadCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
collectRedditThreadCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectRedditThreadCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectRedditThreadCmd.PersistentFlags().String("password", "", "Password for encryption")
return collectRedditThreadCmd
}
122 changes: 122 additions & 0 deletions cmd/collect_reddit_user.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package cmd

import (
"fmt"
"os"
"strings"

"github.com/Snider/Borg/pkg/compress"
"github.com/Snider/Borg/pkg/datanode"
"github.com/Snider/Borg/pkg/reddit"
"github.com/Snider/Borg/pkg/tim"
"github.com/Snider/Borg/pkg/trix"
"github.com/spf13/cobra"
)

// collectRedditUserCmd is the package-level `collect reddit user` command. It
// is built once, at package initialization, by NewCollectRedditUserCmd.
var collectRedditUserCmd = NewCollectRedditUserCmd()

// init registers the user command as a subcommand of the reddit command.
func init() {
GetCollectRedditCmd().AddCommand(GetCollectRedditUserCmd())
}

// GetCollectRedditUserCmd returns the shared package-level user command
// instance.
func GetCollectRedditUserCmd() *cobra.Command {
return collectRedditUserCmd
}

func NewCollectRedditUserCmd() *cobra.Command {
collectRedditUserCmd := &cobra.Command{
Use: "user [name]",
Short: "Collect a user's posts",
Long: `Collect a user's posts and store them in a DataNode.`,
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
userName := args[0]
outputFile, _ := cmd.Flags().GetString("output")
format, _ := cmd.Flags().GetString("format")
compression, _ := cmd.Flags().GetString("compression")
password, _ := cmd.Flags().GetString("password")

if format != "datanode" && format != "tim" && format != "trix" {
return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format)
}

threads, err := reddit.ScrapeUser(userName)
if err != nil {
return fmt.Errorf("failed to scrape user: %w", err)
}

dn := datanode.New()
for _, threadStub := range threads {
thread, err := reddit.ScrapeThread(threadStub.URL)
if err != nil {
// It's better to log the error and continue
fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err)
continue
}

var builder strings.Builder
builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title))
builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post))
for _, comment := range thread.Comments {
builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author))
builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body))
}
// Sanitize filename
filename := strings.ReplaceAll(thread.Title, " ", "_")
filename = strings.ReplaceAll(filename, "/", "_")
err = dn.AddData(fmt.Sprintf("u-%s/posts/%s.md", userName, filename), []byte(builder.String()))

Check failure on line 68 in cmd/collect_reddit_user.go

View workflow job for this annotation

GitHub Actions / build

dn.AddData(fmt.Sprintf("u-%s/posts/%s.md", userName, filename), []byte(builder.String())) (no value) used as value

Check failure on line 68 in cmd/collect_reddit_user.go

View workflow job for this annotation

GitHub Actions / build

dn.AddData(fmt.Sprintf("u-%s/posts/%s.md", userName, filename), []byte(builder.String())) (no value) used as value
if err != nil {
return fmt.Errorf("error adding data to DataNode: %w", err)
}
}

var data []byte
if format == "tim" {
tim, err := tim.FromDataNode(dn)
if err != nil {
return fmt.Errorf("error creating tim: %w", err)
}
data, err = tim.ToTar()
if err != nil {
return fmt.Errorf("error serializing tim: %w", err)
}
} else if format == "trix" {
data, err = trix.ToTrix(dn, password)
if err != nil {
return fmt.Errorf("error serializing trix: %w", err)
}
} else {
data, err = dn.ToTar()
if err != nil {
return fmt.Errorf("error serializing DataNode: %w", err)
}
}

compressedData, err := compress.Compress(data, compression)
if err != nil {
return fmt.Errorf("error compressing data: %w", err)
}

if outputFile == "" {
outputFile = "user." + format
if compression != "none" {
outputFile += "." + compression
}
}

err = os.WriteFile(outputFile, compressedData, 0644)
if err != nil {
return fmt.Errorf("error writing user to file: %w", err)
}

fmt.Fprintln(cmd.OutOrStdout(), "User posts saved to", outputFile)
return nil
},
}
collectRedditUserCmd.PersistentFlags().String("output", "", "Output file for the DataNode")
collectRedditUserCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)")
collectRedditUserCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)")
collectRedditUserCmd.PersistentFlags().String("password", "", "Password for encryption")
return collectRedditUserCmd
}
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ require (
dario.cat/mergo v1.0.0 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/ProtonMail/go-crypto v1.3.0 // indirect
github.com/PuerkitoBio/goquery v1.11.0 // indirect
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/bep/debounce v1.2.1 // indirect
github.com/cloudflare/circl v1.6.1 // indirect
github.com/cyphar/filepath-securejoin v0.4.1 // indirect
Expand Down
Loading
Loading