diff --git a/README.md b/README.md index b80e06b..3f2332b 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,9 @@ borg collect github repo # Clone repository borg collect github repos # Clone all repos from user/org borg collect website --depth 2 # Crawl website borg collect pwa --uri # Download PWA +borg collect npm # Collect npm package +borg collect cargo # Collect cargo crate +borg collect go # Collect Go module # Compilation borg compile -f Borgfile -o out.tim # Plain TIM diff --git a/cmd/collect_cargo.go b/cmd/collect_cargo.go new file mode 100644 index 0000000..eba655c --- /dev/null +++ b/cmd/collect_cargo.go @@ -0,0 +1,61 @@ +package cmd + +import ( + "fmt" + "os" + + "github.com/Snider/Borg/pkg/collect" + "github.com/spf13/cobra" +) + +// collectCargoCmd represents the collect cargo command +var collectCargoCmd = NewCollectCargoCmd() + +func init() { + GetCollectCmd().AddCommand(GetCollectCargoCmd()) +} + +func GetCollectCargoCmd() *cobra.Command { + return collectCargoCmd +} + +func NewCollectCargoCmd() *cobra.Command { + collectCargoCmd := &cobra.Command{ + Use: "cargo [package]", + Short: "Collect a single cargo package", + Long: `Collect a single cargo package and store it in a DataNode.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + packageName := args[0] + outputFile, err := cmd.Flags().GetString("output") + if err != nil { + return fmt.Errorf("could not get output flag: %w", err) + } + + collector := collect.NewCargoCollector() + dn, err := collector.Collect(packageName) + if err != nil { + return fmt.Errorf("error collecting cargo package: %w", err) + } + + data, err := dn.ToTar() + if err != nil { + return fmt.Errorf("error serializing DataNode: %w", err) + } + + if outputFile == "" { + outputFile = packageName + ".dat" + } + + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + return fmt.Errorf("error writing cargo package to file: %w", err) + } + + fmt.Fprintln(cmd.OutOrStdout(), "Cargo package saved to", outputFile) + return nil + }, + } + collectCargoCmd.PersistentFlags().String("output", "", "Output file for the DataNode") + return collectCargoCmd +} diff --git a/cmd/collect_go.go b/cmd/collect_go.go new file mode 100644 index 0000000..5b88553 --- /dev/null +++ b/cmd/collect_go.go @@ -0,0 +1,61 @@ +package cmd + +import ( + "fmt" + "os" + + "github.com/Snider/Borg/pkg/collect" + "github.com/spf13/cobra" +) + +// collectGoCmd represents the collect go command +var collectGoCmd = NewCollectGoCmd() + +func init() { + GetCollectCmd().AddCommand(GetCollectGoCmd()) +} + +func GetCollectGoCmd() *cobra.Command { + return collectGoCmd +} + +func NewCollectGoCmd() *cobra.Command { + collectGoCmd := &cobra.Command{ + Use: "go [module]", + Short: "Collect a single Go module", + Long: `Collect a single Go module and store it in a DataNode.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + modulePath := args[0] + outputFile, err := cmd.Flags().GetString("output") + if err != nil { + return fmt.Errorf("could not get output flag: %w", err) + } + + collector := collect.NewGoCollector() + dn, err := collector.Collect(modulePath) + if err != nil { + return fmt.Errorf("error collecting go module: %w", err) + } + + data, err := dn.ToTar() + if err != nil { + return fmt.Errorf("error serializing DataNode: %w", err) + } + + if outputFile == "" { + outputFile = modulePath + ".dat" + } + + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + return fmt.Errorf("error writing go module to file: %w", err) + } + + fmt.Fprintln(cmd.OutOrStdout(), "Go module saved to", outputFile) + return nil + }, + } + collectGoCmd.PersistentFlags().String("output", "", "Output file for the DataNode") + return collectGoCmd +} diff --git a/cmd/collect_npm.go b/cmd/collect_npm.go new file mode 100644 index 0000000..411f7b5 --- /dev/null +++ b/cmd/collect_npm.go @@ -0,0 +1,61 @@ +package cmd + +import ( + "fmt" + "os" + + "github.com/Snider/Borg/pkg/collect" + "github.com/spf13/cobra" +) + +// collectNpmCmd represents the collect npm command +var collectNpmCmd = NewCollectNpmCmd() + +func init() { + GetCollectCmd().AddCommand(GetCollectNpmCmd()) +} + +func GetCollectNpmCmd() *cobra.Command { + return collectNpmCmd +} + +func NewCollectNpmCmd() *cobra.Command { + collectNpmCmd := &cobra.Command{ + Use: "npm [package]", + Short: "Collect a single npm package", + Long: `Collect a single npm package and store it in a DataNode.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + packageName := args[0] + outputFile, err := cmd.Flags().GetString("output") + if err != nil { + return fmt.Errorf("could not get output flag: %w", err) + } + + collector := collect.NewNPMCollector() + dn, err := collector.Collect(packageName) + if err != nil { + return fmt.Errorf("error collecting npm package: %w", err) + } + + data, err := dn.ToTar() + if err != nil { + return fmt.Errorf("error serializing DataNode: %w", err) + } + + if outputFile == "" { + outputFile = packageName + ".dat" + } + + err = os.WriteFile(outputFile, data, 0644) + if err != nil { + return fmt.Errorf("error writing npm package to file: %w", err) + } + + fmt.Fprintln(cmd.OutOrStdout(), "NPM package saved to", outputFile) + return nil + }, + } + collectNpmCmd.PersistentFlags().String("output", "", "Output file for the DataNode") + return collectNpmCmd +} diff --git a/docs/cli.md b/docs/cli.md index 55c0185..2801769 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -21,11 +21,17 @@ Subcommands: - `borg collect github repos [--output ] [--format ...] [--compression ...]` - `borg collect website [--depth N] [--output ] [--format ...] [--compression ...]` - `borg collect pwa --uri [--output ] [--format ...] [--compression ...]` +- `borg collect npm [--output ]` +- `borg collect cargo [--output ]` +- `borg collect go [--output ]` Examples: - `borg collect github repo https://github.com/Snider/Borg --output borg.dat` - `borg collect website https://example.com --depth 1 --output site.dat` - `borg collect pwa --uri https://squoosh.app --output squoosh.dat` +- `borg collect npm @angular/cli --output angular-cli.dat` +- `borg collect cargo serde --output serde.dat` +- `borg collect go golang.org/x/text --output go-text.dat` ### all diff --git a/pkg/collect/cargo.go b/pkg/collect/cargo.go new file mode 100644 index 0000000..b68ca4e --- /dev/null +++ b/pkg/collect/cargo.go @@ -0,0 +1,114 @@ +package collect + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + + "github.com/Snider/Borg/pkg/datanode" +) + +// CargoRegistryURL is the base URL for the cargo registry. +const CargoRegistryURL = "https://crates.io/api/v1" + +// CargoCollector is a collector for cargo packages. +type CargoCollector struct { + client *http.Client +} + +// NewCargoCollector creates a new CargoCollector. +func NewCargoCollector() *CargoCollector { + return &CargoCollector{ + client: &http.Client{}, + } +} + +// Collect fetches a cargo package and returns a DataNode. +func (c *CargoCollector) Collect(crateName string) (*datanode.DataNode, error) { + meta, err := c.fetchCrateMetadata(crateName) + if err != nil { + return nil, fmt.Errorf("could not fetch crate metadata: %w", err) + } + + dn := datanode.New() + metadata, err := json.MarshalIndent(meta, "", " ") + if err != nil { + return nil, fmt.Errorf("could not marshal metadata: %w", err) + } + dn.AddData("metadata.json", metadata) + + for _, version := range meta.Versions { + if err := c.fetchAndAddCrate(dn, version.DlPath, version.Num+".crate"); err != nil { + return nil, fmt.Errorf("could not fetch crate for version %s: %w", version.Num, err) + } + } + + return dn, nil +} + +func (c *CargoCollector) fetchCrateMetadata(crateName string) (*CargoCrate, error) { + req, err := http.NewRequest("GET", fmt.Sprintf("%s/crates/%s", CargoRegistryURL, crateName), nil) + if err != nil { + return nil, err + } + req.Header.Set("User-Agent", "git/oxide-0.38.0") + + resp, err := c.client.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("bad status: %s", resp.Status) + } + + var crate CargoCrate + if err := json.NewDecoder(resp.Body).Decode(&crate); err != nil { + return nil, err + } + return &crate, nil +} + +func (c *CargoCollector) fetchAndAddCrate(dn *datanode.DataNode, downloadURL, filename string) error { + req, err := http.NewRequest("GET", fmt.Sprintf("https://crates.io%s", downloadURL), nil) + if err != nil { + return err + } + req.Header.Set("User-Agent", "git/oxide-0.38.0") + + resp, err := c.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("bad status: %s", resp.Status) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return err + } + dn.AddData(filename, data) + return nil +} + +// CargoCrate represents the metadata for a cargo crate. +type CargoCrate struct { + Crate CargoCrateData `json:"crate"` + Versions []CargoVersionData `json:"versions"` +} + +// CargoCrateData represents the metadata for a cargo crate. +type CargoCrateData struct { + Name string `json:"name"` +} + +// CargoVersionData represents the metadata for a specific version of a cargo crate. +type CargoVersionData struct { + Num string `json:"num"` + DlPath string `json:"dl_path"` +} diff --git a/pkg/collect/cargo_test.go b/pkg/collect/cargo_test.go new file mode 100644 index 0000000..6f8f0b9 --- /dev/null +++ b/pkg/collect/cargo_test.go @@ -0,0 +1,50 @@ +package collect + +import ( + "bytes" + "io" + "net/http" + "strings" + "testing" +) + +func TestCargoCollector_Collect(t *testing.T) { + client := &http.Client{ + Transport: &mockHTTPClient{ + responses: map[string]*http.Response{ + "https://crates.io/api/v1/crates/monero-rs": { + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader(`{ + "crate": { + "name": "monero-rs" + }, + "versions": [ + { + "num": "0.1.0", + "dl_path": "/api/v1/crates/monero-rs/0.1.0/download" + } + ] + }`)), + }, + "https://crates.io/api/v1/crates/monero-rs/0.1.0/download": { + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewReader([]byte("crate content"))), + }, + }, + }, + } + + collector := &CargoCollector{client: client} + dn, err := collector.Collect("monero-rs") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if _, err := dn.Stat("metadata.json"); err != nil { + t.Errorf("expected metadata.json to exist") + } + + if _, err := dn.Stat("0.1.0.crate"); err != nil { + t.Errorf("expected 0.1.0.crate to exist") + } +} diff --git a/pkg/collect/go.go b/pkg/collect/go.go new file mode 100644 index 0000000..d4c0e6c --- /dev/null +++ b/pkg/collect/go.go @@ -0,0 +1,81 @@ +package collect + +import ( + "fmt" + "io" + "net/http" + "strings" + + "github.com/Snider/Borg/pkg/datanode" +) + +// GoProxyURL is the base URL for the Go module proxy. +const GoProxyURL = "https://proxy.golang.org" + +// GoCollector is a collector for Go modules. +type GoCollector struct { + client *http.Client +} + +// NewGoCollector creates a new GoCollector. +func NewGoCollector() *GoCollector { + return &GoCollector{ + client: http.DefaultClient, + } +} + +// Collect fetches a Go module and returns a DataNode. +func (c *GoCollector) Collect(modulePath string) (*datanode.DataNode, error) { + versions, err := c.fetchModuleVersions(modulePath) + if err != nil { + return nil, fmt.Errorf("could not fetch module versions: %w", err) + } + + dn := datanode.New() + for _, version := range versions { + if err := c.fetchAndAddSource(dn, modulePath, version); err != nil { + return nil, fmt.Errorf("could not fetch source for version %s: %w", version, err) + } + } + + return dn, nil +} + +func (c *GoCollector) fetchModuleVersions(modulePath string) ([]string, error) { + resp, err := c.client.Get(fmt.Sprintf("%s/%s/@v/list", GoProxyURL, modulePath)) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("bad status: %s", resp.Status) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + return strings.Split(string(body), "\n"), nil +} + +func (c *GoCollector) fetchAndAddSource(dn *datanode.DataNode, modulePath, version string) error { + resp, err := c.client.Get(fmt.Sprintf("%s/%s/@v/%s.zip", GoProxyURL, modulePath, version)) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("bad status: %s", resp.Status) + } + + data, err := io.ReadAll(resp.Body) + if err != nil { + return err + } + + dn.AddData(version+".zip", data) + return nil +} diff --git a/pkg/collect/go_test.go b/pkg/collect/go_test.go new file mode 100644 index 0000000..2bb4bb0 --- /dev/null +++ b/pkg/collect/go_test.go @@ -0,0 +1,52 @@ +package collect + +import ( + "bytes" + "io" + "net/http" + "strings" + "testing" +) + +type mockGoHTTPClient struct { + responses map[string]*http.Response +} + +func (c *mockGoHTTPClient) RoundTrip(req *http.Request) (*http.Response, error) { + return c.responses[req.URL.String()], nil +} + +func TestGoCollector_Collect(t *testing.T) { + client := &http.Client{ + Transport: &mockGoHTTPClient{ + responses: map[string]*http.Response{ + "https://proxy.golang.org/github.com/monero-ecosystem/go-monero/@v/list": { + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader("v0.1.0\nv0.2.0")), + }, + "https://proxy.golang.org/github.com/monero-ecosystem/go-monero/@v/v0.1.0.zip": { + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewReader([]byte("zip content v0.1.0"))), + }, + "https://proxy.golang.org/github.com/monero-ecosystem/go-monero/@v/v0.2.0.zip": { + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewReader([]byte("zip content v0.2.0"))), + }, + }, + }, + } + + collector := &GoCollector{client: client} + dn, err := collector.Collect("github.com/monero-ecosystem/go-monero") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if _, err := dn.Stat("v0.1.0.zip"); err != nil { + t.Errorf("expected v0.1.0.zip to exist") + } + + if _, err := dn.Stat("v0.2.0.zip"); err != nil { + t.Errorf("expected v0.2.0.zip to exist") + } +} diff --git a/pkg/collect/npm.go b/pkg/collect/npm.go new file mode 100644 index 0000000..e4c16e9 --- /dev/null +++ b/pkg/collect/npm.go @@ -0,0 +1,104 @@ +package collect + +import ( + "encoding/json" + "fmt" + "io" + "log" + "net/http" + + "github.com/Snider/Borg/pkg/datanode" +) + +// NPMRegistryURL is the base URL for the npm registry. +const NPMRegistryURL = "https://registry.npmjs.org" + +// NPMCollector is a collector for npm packages. +type NPMCollector struct { + client *http.Client +} + +// NewNPMCollector creates a new NPMCollector. +func NewNPMCollector() *NPMCollector { + return &NPMCollector{ + client: http.DefaultClient, + } +} + +// Collect fetches an npm package and returns a DataNode. +func (c *NPMCollector) Collect(packageName string) (*datanode.DataNode, error) { + meta, err := c.fetchPackageMetadata(packageName) + if err != nil { + return nil, fmt.Errorf("could not fetch package metadata: %w", err) + } + + dn := datanode.New() + metadata, err := json.MarshalIndent(meta, "", " ") + if err != nil { + return nil, fmt.Errorf("could not marshal metadata: %w", err) + } + dn.AddData("metadata.json", metadata) + + for version, data := range meta.Versions { + if err := c.fetchAndAddTarball(dn, data.Dist.Tarball, version+".tgz"); err != nil { + // It is a valid use case to only collect metadata + log.Printf("could not fetch tarball for version %s: %v", version, err) + } + } + + return dn, nil +} + +func (c *NPMCollector) fetchAndAddTarball(dn *datanode.DataNode, url, filename string) error { + resp, err := c.client.Get(url) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("bad status: %s", resp.Status) + } + + data, err := c.readBody(resp.Body) + if err != nil { + return err + } + dn.AddData(filename, data) + return nil +} + +func (c *NPMCollector) fetchPackageMetadata(packageName string) (*NPMPackage, error) { + resp, err := c.client.Get(fmt.Sprintf("%s/%s", NPMRegistryURL, packageName)) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("bad status: %s", resp.Status) + } + + var pkg NPMPackage + if err := json.NewDecoder(resp.Body).Decode(&pkg); err != nil { + return nil, err + } + return &pkg, nil +} + +func (c *NPMCollector) readBody(body io.Reader) ([]byte, error) { + return io.ReadAll(body) +} + +// NPMPackage represents the metadata for an npm package. +type NPMPackage struct { + Name string `json:"name"` + Versions map[string]NPMVersionData `json:"versions"` +} + +// NPMVersionData represents the metadata for a specific version of an npm package. +type NPMVersionData struct { + Dist struct { + Tarball string `json:"tarball"` + } `json:"dist"` +} diff --git a/pkg/collect/npm_test.go b/pkg/collect/npm_test.go new file mode 100644 index 0000000..63d490d --- /dev/null +++ b/pkg/collect/npm_test.go @@ -0,0 +1,57 @@ +package collect + +import ( + "bytes" + "io" + "net/http" + "strings" + "testing" +) + +type mockHTTPClient struct { + responses map[string]*http.Response +} + +func (c *mockHTTPClient) RoundTrip(req *http.Request) (*http.Response, error) { + return c.responses[req.URL.String()], nil +} + +func TestNPMCollector_Collect(t *testing.T) { + client := &http.Client{ + Transport: &mockHTTPClient{ + responses: map[string]*http.Response{ + "https://registry.npmjs.org/@monero-project/monero-ts": { + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader(`{ + "name": "@monero-project/monero-ts", + "versions": { + "1.0.0": { + "dist": { + "tarball": "https://registry.npmjs.org/@monero-project/monero-ts/-/monero-ts-1.0.0.tgz" + } + } + } + }`)), + }, + "https://registry.npmjs.org/@monero-project/monero-ts/-/monero-ts-1.0.0.tgz": { + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewReader([]byte("tarball content"))), + }, + }, + }, + } + + collector := &NPMCollector{client: client} + dn, err := collector.Collect("@monero-project/monero-ts") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if _, err := dn.Stat("metadata.json"); err != nil { + t.Errorf("expected metadata.json to exist") + } + + if _, err := dn.Stat("1.0.0.tgz"); err != nil { + t.Errorf("expected 1.0.0.tgz to exist") + } +}