diff --git a/cmd/collect_reddit.go b/cmd/collect_reddit.go new file mode 100644 index 0000000..91ae0bd --- /dev/null +++ b/cmd/collect_reddit.go @@ -0,0 +1,24 @@ +package cmd + +import ( + "github.com/spf13/cobra" +) + +// collectRedditCmd represents the collect reddit command +var collectRedditCmd = NewCollectRedditCmd() + +func init() { + GetCollectCmd().AddCommand(GetCollectRedditCmd()) +} + +func NewCollectRedditCmd() *cobra.Command { + return &cobra.Command{ + Use: "reddit", + Short: "Collect a resource from Reddit.", + Long: `Collect a resource from Reddit and store it in a DataNode.`, + } +} + +func GetCollectRedditCmd() *cobra.Command { + return collectRedditCmd +} diff --git a/cmd/collect_reddit_subreddit.go b/cmd/collect_reddit_subreddit.go new file mode 100644 index 0000000..6b0141e --- /dev/null +++ b/cmd/collect_reddit_subreddit.go @@ -0,0 +1,126 @@ +package cmd + +import ( + "fmt" + "os" + "strings" + + "github.com/Snider/Borg/pkg/compress" + "github.com/Snider/Borg/pkg/datanode" + "github.com/Snider/Borg/pkg/reddit" + "github.com/Snider/Borg/pkg/tim" + "github.com/Snider/Borg/pkg/trix" + "github.com/spf13/cobra" +) + +// collectRedditSubredditCmd represents the collect reddit subreddit command +var collectRedditSubredditCmd = NewCollectRedditSubredditCmd() + +func init() { + GetCollectRedditCmd().AddCommand(GetCollectRedditSubredditCmd()) +} + +func GetCollectRedditSubredditCmd() *cobra.Command { + return collectRedditSubredditCmd +} + +func NewCollectRedditSubredditCmd() *cobra.Command { + collectRedditSubredditCmd := &cobra.Command{ + Use: "subreddit [name]", + Short: "Collect a subreddit's top posts", + Long: `Collect a subreddit's top posts and store them in a DataNode.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + subredditName := args[0] + outputFile, _ := cmd.Flags().GetString("output") + limit, _ := cmd.Flags().GetInt("limit") + sort, _ := cmd.Flags().GetString("sort") + format, _ := cmd.Flags().GetString("format") + compression, _ := cmd.Flags().GetString("compression") + password, _ := cmd.Flags().GetString("password") + + if format != "datanode" && format != "tim" && format != "trix" { + return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format) + } + + threads, err := reddit.ScrapeSubreddit(subredditName, sort, limit) + if err != nil { + return fmt.Errorf("failed to scrape subreddit: %w", err) + } + + dn := datanode.New() + for _, threadStub := range threads { + thread, err := reddit.ScrapeThread(threadStub.URL) + if err != nil { + // It's better to log the error and continue + fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err) + continue + } + + var builder strings.Builder + builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title)) + builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post)) + for _, comment := range thread.Comments { + builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author)) + builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body)) + } + // Sanitize filename + filename := strings.ReplaceAll(thread.Title, " ", "_") + filename = strings.ReplaceAll(filename, "/", "_") + err = dn.AddData(fmt.Sprintf("r-%s/posts/%s.md", subredditName, filename), []byte(builder.String())) + if err != nil { + return fmt.Errorf("error adding data to DataNode: %w", err) + } + } + + var data []byte + if format == "tim" { + tim, err := tim.FromDataNode(dn) + if err != nil { + return fmt.Errorf("error creating tim: %w", err) + } + data, err = tim.ToTar() + if err != nil { + return fmt.Errorf("error serializing tim: %w", err) + } + } else if format == "trix" { + data, err = trix.ToTrix(dn, password) + if err != nil { + return fmt.Errorf("error serializing trix: %w", err) + } + } else { + data, err = dn.ToTar() + if err != nil { + return fmt.Errorf("error serializing DataNode: %w", err) + } + } + + compressedData, err := compress.Compress(data, compression) + if err != nil { + return fmt.Errorf("error compressing data: %w", err) + } + + if outputFile == "" { + outputFile = "subreddit." + format + if compression != "none" { + outputFile += "." + compression + } + } + + err = os.WriteFile(outputFile, compressedData, 0644) + if err != nil { + return fmt.Errorf("error writing subreddit to file: %w", err) + } + + fmt.Fprintln(cmd.OutOrStdout(), "Subreddit saved to", outputFile) + return nil + }, + } + collectRedditSubredditCmd.PersistentFlags().String("output", "", "Output file for the DataNode") + collectRedditSubredditCmd.PersistentFlags().Int("limit", 100, "Number of posts to collect") + collectRedditSubredditCmd.PersistentFlags().String("sort", "top", "Sort order for posts (top, new)") + collectRedditSubredditCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)") + collectRedditSubredditCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") + collectRedditSubredditCmd.PersistentFlags().String("password", "", "Password for encryption") + return collectRedditSubredditCmd +} diff --git a/cmd/collect_reddit_test.go b/cmd/collect_reddit_test.go new file mode 100644 index 0000000..0c0256b --- /dev/null +++ b/cmd/collect_reddit_test.go @@ -0,0 +1 @@ +package cmd \ No newline at end of file diff --git a/cmd/collect_reddit_thread.go b/cmd/collect_reddit_thread.go new file mode 100644 index 0000000..42c921a --- /dev/null +++ b/cmd/collect_reddit_thread.go @@ -0,0 +1,112 @@ +package cmd + +import ( + "fmt" + "os" + "strings" + + "github.com/Snider/Borg/pkg/compress" + "github.com/Snider/Borg/pkg/datanode" + "github.com/Snider/Borg/pkg/reddit" + "github.com/Snider/Borg/pkg/tim" + "github.com/Snider/Borg/pkg/trix" + "github.com/spf13/cobra" +) + +// collectRedditThreadCmd represents the collect reddit thread command +var collectRedditThreadCmd = NewCollectRedditThreadCmd() + +func init() { + GetCollectRedditCmd().AddCommand(GetCollectRedditThreadCmd()) +} + +func GetCollectRedditThreadCmd() *cobra.Command { + return collectRedditThreadCmd +} + +func NewCollectRedditThreadCmd() *cobra.Command { + collectRedditThreadCmd := &cobra.Command{ + Use: "thread [url]", + Short: "Collect a single Reddit thread", + Long: `Collect a single Reddit thread and store it in a DataNode.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + threadURL := args[0] + outputFile, _ := cmd.Flags().GetString("output") + format, _ := cmd.Flags().GetString("format") + compression, _ := cmd.Flags().GetString("compression") + password, _ := cmd.Flags().GetString("password") + + if format != "datanode" && format != "tim" && format != "trix" { + return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format) + } + + thread, err := reddit.ScrapeThread(threadURL) + if err != nil { + return fmt.Errorf("failed to scrape thread: %w", err) + } + + // Convert thread to Markdown + var builder strings.Builder + builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title)) + builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post)) + for _, comment := range thread.Comments { + builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author)) + builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body)) + } + + dn := datanode.New() + err = dn.AddData("thread.md", []byte(builder.String())) + if err != nil { + return fmt.Errorf("error adding data to DataNode: %w", err) + } + + var data []byte + if format == "tim" { + tim, err := tim.FromDataNode(dn) + if err != nil { + return fmt.Errorf("error creating tim: %w", err) + } + data, err = tim.ToTar() + if err != nil { + return fmt.Errorf("error serializing tim: %w", err) + } + } else if format == "trix" { + data, err = trix.ToTrix(dn, password) + if err != nil { + return fmt.Errorf("error serializing trix: %w", err) + } + } else { + data, err = dn.ToTar() + if err != nil { + return fmt.Errorf("error serializing DataNode: %w", err) + } + } + + compressedData, err := compress.Compress(data, compression) + if err != nil { + return fmt.Errorf("error compressing data: %w", err) + } + + if outputFile == "" { + outputFile = "thread." + format + if compression != "none" { + outputFile += "." + compression + } + } + + err = os.WriteFile(outputFile, compressedData, 0644) + if err != nil { + return fmt.Errorf("error writing thread to file: %w", err) + } + + fmt.Fprintln(cmd.OutOrStdout(), "Thread saved to", outputFile) + return nil + }, + } + collectRedditThreadCmd.PersistentFlags().String("output", "", "Output file for the DataNode") + collectRedditThreadCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)") + collectRedditThreadCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") + collectRedditThreadCmd.PersistentFlags().String("password", "", "Password for encryption") + return collectRedditThreadCmd +} diff --git a/cmd/collect_reddit_user.go b/cmd/collect_reddit_user.go new file mode 100644 index 0000000..2a3af4b --- /dev/null +++ b/cmd/collect_reddit_user.go @@ -0,0 +1,122 @@ +package cmd + +import ( + "fmt" + "os" + "strings" + + "github.com/Snider/Borg/pkg/compress" + "github.com/Snider/Borg/pkg/datanode" + "github.com/Snider/Borg/pkg/reddit" + "github.com/Snider/Borg/pkg/tim" + "github.com/Snider/Borg/pkg/trix" + "github.com/spf13/cobra" +) + +// collectRedditUserCmd represents the collect reddit user command +var collectRedditUserCmd = NewCollectRedditUserCmd() + +func init() { + GetCollectRedditCmd().AddCommand(GetCollectRedditUserCmd()) +} + +func GetCollectRedditUserCmd() *cobra.Command { + return collectRedditUserCmd +} + +func NewCollectRedditUserCmd() *cobra.Command { + collectRedditUserCmd := &cobra.Command{ + Use: "user [name]", + Short: "Collect a user's posts", + Long: `Collect a user's posts and store them in a DataNode.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + userName := args[0] + outputFile, _ := cmd.Flags().GetString("output") + format, _ := cmd.Flags().GetString("format") + compression, _ := cmd.Flags().GetString("compression") + password, _ := cmd.Flags().GetString("password") + + if format != "datanode" && format != "tim" && format != "trix" { + return fmt.Errorf("invalid format: %s (must be 'datanode', 'tim', or 'trix')", format) + } + + threads, err := reddit.ScrapeUser(userName) + if err != nil { + return fmt.Errorf("failed to scrape user: %w", err) + } + + dn := datanode.New() + for _, threadStub := range threads { + thread, err := reddit.ScrapeThread(threadStub.URL) + if err != nil { + // It's better to log the error and continue + fmt.Fprintf(cmd.ErrOrStderr(), "failed to scrape thread %s: %v\n", threadStub.URL, err) + continue + } + + var builder strings.Builder + builder.WriteString(fmt.Sprintf("# %s\n\n", thread.Title)) + builder.WriteString(fmt.Sprintf("%s\n\n", thread.Post)) + for _, comment := range thread.Comments { + builder.WriteString(fmt.Sprintf("## %s\n\n", comment.Author)) + builder.WriteString(fmt.Sprintf("%s\n\n", comment.Body)) + } + // Sanitize filename + filename := strings.ReplaceAll(thread.Title, " ", "_") + filename = strings.ReplaceAll(filename, "/", "_") + err = dn.AddData(fmt.Sprintf("u-%s/posts/%s.md", userName, filename), []byte(builder.String())) + if err != nil { + return fmt.Errorf("error adding data to DataNode: %w", err) + } + } + + var data []byte + if format == "tim" { + tim, err := tim.FromDataNode(dn) + if err != nil { + return fmt.Errorf("error creating tim: %w", err) + } + data, err = tim.ToTar() + if err != nil { + return fmt.Errorf("error serializing tim: %w", err) + } + } else if format == "trix" { + data, err = trix.ToTrix(dn, password) + if err != nil { + return fmt.Errorf("error serializing trix: %w", err) + } + } else { + data, err = dn.ToTar() + if err != nil { + return fmt.Errorf("error serializing DataNode: %w", err) + } + } + + compressedData, err := compress.Compress(data, compression) + if err != nil { + return fmt.Errorf("error compressing data: %w", err) + } + + if outputFile == "" { + outputFile = "user." + format + if compression != "none" { + outputFile += "." + compression + } + } + + err = os.WriteFile(outputFile, compressedData, 0644) + if err != nil { + return fmt.Errorf("error writing user to file: %w", err) + } + + fmt.Fprintln(cmd.OutOrStdout(), "User posts saved to", outputFile) + return nil + }, + } + collectRedditUserCmd.PersistentFlags().String("output", "", "Output file for the DataNode") + collectRedditUserCmd.PersistentFlags().String("format", "datanode", "Output format (datanode, tim, or trix)") + collectRedditUserCmd.PersistentFlags().String("compression", "none", "Compression format (none, gz, or xz)") + collectRedditUserCmd.PersistentFlags().String("password", "", "Password for encryption") + return collectRedditUserCmd +} diff --git a/go.mod b/go.mod index d1c5f08..3e08aed 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,8 @@ require ( dario.cat/mergo v1.0.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/ProtonMail/go-crypto v1.3.0 // indirect + github.com/PuerkitoBio/goquery v1.11.0 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect github.com/bep/debounce v1.2.1 // indirect github.com/cloudflare/circl v1.6.1 // indirect github.com/cyphar/filepath-securejoin v0.4.1 // indirect diff --git a/go.sum b/go.sum index 2a41157..5702f88 100644 --- a/go.sum +++ b/go.sum @@ -5,8 +5,12 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/ProtonMail/go-crypto v1.3.0 h1:ILq8+Sf5If5DCpHQp4PbZdS1J7HDFRXz/+xKBiRGFrw= github.com/ProtonMail/go-crypto v1.3.0/go.mod h1:9whxjD8Rbs29b4XWbB8irEcE8KHMqaR2e7GWU1R+/PE= +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= github.com/Snider/Enchantrix v0.0.2 h1:ExZQiBhfS/p/AHFTKhY80TOd+BXZjK95EzByAEgwvjs= github.com/Snider/Enchantrix v0.0.2/go.mod h1:CtFcLAvnDT1KcuF1JBb/DJj0KplY8jHryO06KzQ1hsQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= @@ -49,6 +53,7 @@ github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-github/v39 v39.2.0 h1:rNNM311XtPOz5rDdsJXAp2o8F67X9FnROXTvto3aSnQ= @@ -152,24 +157,50 @@ github.com/wailsapp/wails/v2 v2.11.0 h1:seLacV8pqupq32IjS4Y7V8ucab0WZwtK6VvUVxSB github.com/wailsapp/wails/v2 v2.11.0/go.mod h1:jrf0ZaM6+GBc1wRmXsM8cIvzlg0karYin3erahI4+0k= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU= golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo= golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200810151505-1b9f1253b3ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -177,22 +208,51 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/pkg/reddit/reddit.go b/pkg/reddit/reddit.go new file mode 100644 index 0000000..bec1e81 --- /dev/null +++ b/pkg/reddit/reddit.go @@ -0,0 +1,131 @@ +package reddit + +import ( + "fmt" + "net/http" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +// Comment represents a single Reddit comment. +type Comment struct { + Author string + Body string +} + +// Thread represents a Reddit thread, including the original post and all comments. +type Thread struct { + Title string + Post string + Comments []Comment + URL string +} + +// ScrapeThread fetches and parses a Reddit thread from a given URL. +func ScrapeThread(url string) (*Thread, error) { + // Make sure we're using old.reddit.com for simpler scraping + if !strings.Contains(url, "old.reddit.com") { + url = strings.Replace(url, "reddit.com", "old.reddit.com", 1) + } + + res, err := http.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to fetch URL: %w", err) + } + defer res.Body.Close() + + if res.StatusCode != 200 { + return nil, fmt.Errorf("request failed with status: %s", res.Status) + } + + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + thread := &Thread{} + + // Scrape the post title and content + thread.Title = doc.Find("a.title").First().Text() + thread.Post = doc.Find("div.expando .md").First().Text() + + // Scrape comments + doc.Find(".commentarea .comment").Each(func(i int, s *goquery.Selection) { + author := s.Find(".author").First().Text() + body := s.Find(".md").First().Text() + thread.Comments = append(thread.Comments, Comment{Author: author, Body: body}) + }) + + return thread, nil +} + +// ScrapeSubreddit fetches and parses a subreddit's posts. +func ScrapeSubreddit(name, sort string, limit int) ([]*Thread, error) { + url := fmt.Sprintf("https://old.reddit.com/r/%s/", name) + if sort == "top" { + url = fmt.Sprintf("https://old.reddit.com/r/%s/top/?t=all", name) + } + + res, err := http.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to fetch URL: %w", err) + } + defer res.Body.Close() + + if res.StatusCode != 200 { + return nil, fmt.Errorf("request failed with status: %s", res.Status) + } + + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + var threads []*Thread + doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) { + if i >= limit { + return + } + title := s.Find("a.title").Text() + postURL, _ := s.Find("a.title").Attr("href") + if !strings.HasPrefix(postURL, "http") { + postURL = "https://old.reddit.com" + postURL + } + threads = append(threads, &Thread{Title: title, URL: postURL}) + }) + + return threads, nil +} + +// ScrapeUser fetches and parses a user's posts. +func ScrapeUser(name string) ([]*Thread, error) { + url := fmt.Sprintf("https://old.reddit.com/user/%s/", name) + + res, err := http.Get(url) + if err != nil { + return nil, fmt.Errorf("failed to fetch URL: %w", err) + } + defer res.Body.Close() + + if res.StatusCode != 200 { + return nil, fmt.Errorf("request failed with status: %s", res.Status) + } + + doc, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + return nil, fmt.Errorf("failed to parse HTML: %w", err) + } + + var threads []*Thread + doc.Find("div.thing.link").Each(func(i int, s *goquery.Selection) { + title := s.Find("a.title").Text() + postURL, _ := s.Find("a.title").Attr("href") + if !strings.HasPrefix(postURL, "http") { + postURL = "https://old.reddit.com" + postURL + } + threads = append(threads, &Thread{Title: title, URL: postURL}) + }) + + return threads, nil +} diff --git a/pkg/reddit/reddit_test.go b/pkg/reddit/reddit_test.go new file mode 100644 index 0000000..5aa6020 --- /dev/null +++ b/pkg/reddit/reddit_test.go @@ -0,0 +1 @@ +package reddit \ No newline at end of file