diff --git a/README.md b/README.md index b204aaa..248c83b 100644 --- a/README.md +++ b/README.md @@ -78,17 +78,16 @@ Further documentation can be found at . ## Utility Support -| Type | to_string | Builder Functions | Validators | -| ---------- | --------- | ----------------- | ---------- | -| Sitemap | Complete | Complete | None | -| RSS v2.0 | Complete | Complete | None | -| Robots.txt | Complete | Complete | None | -| Atom | Complete | Complete | None | +| Type | Builder Functions | to_string | from_string | +| ---------- | ----------------- | --------- | ----------- | +| Sitemap | Complete | Complete | Complete | +| RSS v2.0 | Complete | Complete | Complete | +| Robots.txt | Complete | Complete | Complete | +| Atom | Complete | Complete | None | ## Development ```sh -gleam run # Run the project gleam test # Run the tests ``` diff --git a/gleam.toml b/gleam.toml index 3158589..1bdade0 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,5 +1,5 @@ name = "webls" -version = "1.6.1" +version = "2.0.0" description = "A simple web utility library for RSS feeds, Sitemaps, Robots.txt, etc." licences = ["Apache-2.0"] @@ -8,6 +8,7 @@ repository = { type = "github", user = "versecafe", repo = "webls" } [dependencies] gleam_stdlib = ">= 0.34.0 and < 2.0.0" gleam_time = ">= 1.6.0 and < 2.0.0" +parsed_it = ">= 0.1.1 and < 0.2.0" [dev-dependencies] gleeunit = ">= 1.0.0 and < 2.0.0" diff --git a/manifest.toml b/manifest.toml index 8750955..1f7499d 100644 --- a/manifest.toml +++ b/manifest.toml @@ -4,8 +4,9 @@ packages = [ { name = "filepath", version = "1.1.2", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "filepath", source = "hex", outer_checksum = "B06A9AF0BF10E51401D64B98E4B627F1D2E48C154967DA7AF4D0914780A6D40A" }, { name = "gleam_stdlib", version = "0.68.1", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "F7FAEBD8EF260664E86A46C8DBA23508D1D11BB3BCC6EE1B89B3BC3E5C83FF1E" }, - { name = "gleam_time", version = "1.6.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_time", source = "hex", outer_checksum = "0DF3834D20193F0A38D0EB21F0A78D48F2EC276C285969131B86DF8D4EF9E762" }, + { name = "gleam_time", version = "1.7.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_time", source = "hex", outer_checksum = "56DB0EF9433826D3B99DB0B4AF7A2BFED13D09755EC64B1DAAB46F804A9AD47D" }, { name = "gleeunit", version = "1.9.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "DA9553CE58B67924B3C631F96FE3370C49EB6D6DC6B384EC4862CC4AAA718F3C" }, + { name = "parsed_it", version = "0.1.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "parsed_it", source = "hex", outer_checksum = "9F8BA3C634FEA847AD195E3322FD1DA51980F57C4171B02DCF069C6FC807944A" }, { name = "simplifile", version = "2.3.2", build_tools = ["gleam"], requirements = ["filepath", "gleam_stdlib"], otp_app = "simplifile", source = "hex", outer_checksum = "E049B4DACD4D206D87843BCF4C775A50AE0F50A52031A2FFB40C9ED07D6EC70A" }, ] @@ -13,4 +14,5 @@ packages = [ gleam_stdlib = { version = ">= 0.34.0 and < 2.0.0" } gleam_time = { version = ">= 1.6.0 and < 2.0.0" } gleeunit = { version = ">= 1.0.0 and < 2.0.0" } +parsed_it = { version = ">= 0.1.1 and < 0.2.0" } simplifile = { version = ">= 2.3.2 and < 3.0.0" } diff --git a/src/webls/robots.gleam b/src/webls/robots.gleam index 564a1e0..c76a2c7 100644 --- a/src/webls/robots.gleam +++ b/src/webls/robots.gleam @@ -1,16 +1,57 @@ +//// Functions for building and parsing robots.txt files. +//// +//// ## Building a robots.txt +//// +//// ```gleam +//// import webls/robots +//// +//// robots.config("https://example.com/sitemap.xml") +//// |> robots.with_config_robot( +//// robots.robot("*") +//// |> robots.with_robot_disallowed_route("/admin/") +//// ) +//// |> robots.to_string +//// ``` +//// +//// ## Parsing a robots.txt +//// +//// ```gleam +//// import webls/robots +//// +//// let assert Ok(config) = robots.from_string(robots_txt_content) +//// // Access config.sitemap_url and config.robots +//// ``` +//// +//// The parser handles comments, extra whitespace, and case-insensitive +//// directives. Unknown directives are ignored. Malformed lines (missing `:`) +//// return an error. + import gleam/list +import gleam/option.{type Option, None, Some} import gleam/result +import gleam/string // Stringify ------------------------------------------------------------------ +/// Converts a RobotsConfig to a robots.txt formatted string. +/// +/// The output format follows the standard robots.txt specification: +/// - Sitemap directive at the top (if present) +/// - User-agent blocks separated by blank lines +/// - Allow directives followed by Disallow directives for each agent pub fn to_string(config: RobotsConfig) -> String { - "Sitemap: " - <> config.sitemap_url - <> "\n\n" - <> config.robots - |> list.map(fn(robot) { robot |> robot_to_string }) - |> list.reduce(fn(acc, line) { acc <> "\n\n" <> line }) - |> result.unwrap("") + let sitemap_section = case config.sitemap_url { + Some(url) -> "Sitemap: " <> url <> "\n\n" + None -> "" + } + + let robots_section = + config.robots + |> list.map(fn(robot) { robot |> robot_to_string }) + |> list.reduce(fn(acc, line) { acc <> "\n\n" <> line }) + |> result.unwrap("") + + sitemap_section <> robots_section } fn robot_to_string(robot: Robot) -> String { @@ -28,11 +69,24 @@ fn robot_to_string(robot: Robot) -> String { |> result.unwrap("") } -// Builder Patern ------------------------------------------------------------- +// Builder Pattern ------------------------------------------------------------ /// Creates a robots config with a sitemap url pub fn config(sitemap_url: String) -> RobotsConfig { - RobotsConfig(sitemap_url: sitemap_url, robots: []) + RobotsConfig(sitemap_url: Some(sitemap_url), robots: []) +} + +/// Creates a robots config without a sitemap url +pub fn config_without_sitemap() -> RobotsConfig { + RobotsConfig(sitemap_url: None, robots: []) +} + +/// Sets the sitemap url on a robots config +pub fn with_config_sitemap( + config: RobotsConfig, + sitemap_url: String, +) -> RobotsConfig { + RobotsConfig(..config, sitemap_url: Some(sitemap_url)) } /// Adds a list of robots to the robots config @@ -58,7 +112,7 @@ pub fn with_robot_allowed_routes(robot: Robot, routes: List(String)) -> Robot { Robot(..robot, allowed_routes: list.flatten([robot.allowed_routes, routes])) } -/// Adds a allowed route to the robot policy +/// Adds an allowed route to the robot policy pub fn with_robot_allowed_route(robot: Robot, route: String) -> Robot { Robot(..robot, allowed_routes: [route, ..robot.allowed_routes]) } @@ -81,8 +135,8 @@ pub fn with_robot_disallowed_route(robot: Robot, route: String) -> Robot { /// The configuration for a robots.txt file pub type RobotsConfig { RobotsConfig( - /// The url of the sitemap for crawlers to use - sitemap_url: String, + /// The optional url of the sitemap for crawlers to use + sitemap_url: Option(String), /// A list of robot policies robots: List(Robot), ) @@ -99,3 +153,140 @@ pub type Robot { disallowed_routes: List(String), ) } + +/// Error returned when parsing a malformed robots.txt line +pub type RobotsParseError { + /// A line could not be parsed as a valid directive (missing `:`) + InvalidDirective(line: String) +} + +// Parse ---------------------------------------------------------------------- + +/// Parses a robots.txt string into a RobotsConfig. +/// +/// The parser handles: +/// - Case-insensitive directives (e.g., `USER-AGENT`, `user-agent`) +/// - Comments (lines starting with `#` or inline `# comment`) +/// - Extra whitespace around directives and values +/// - Unknown directives (silently ignored) +/// +/// Returns an error if a non-empty, non-comment line is malformed (missing `:`). +/// An empty config (no sitemap, no robots) is valid. +/// Directives appearing before any `User-agent:` line are ignored. +pub fn from_string(input: String) -> Result(RobotsConfig, RobotsParseError) { + let lines = + input + |> string.split("\n") + |> list.map(strip_comment) + |> list.map(string.trim) + |> list.filter(fn(line) { line != "" }) + + case validate_lines(lines) { + Error(e) -> Error(e) + Ok(_) -> { + let sitemap_url = find_sitemap(lines) + let robot_lines = list.filter(lines, fn(line) { !is_sitemap_line(line) }) + let robots = parse_robots(robot_lines, [], None) + Ok(RobotsConfig(sitemap_url: sitemap_url, robots: robots)) + } + } +} + +/// Validates that all lines are valid directives (contain `:`) +fn validate_lines(lines: List(String)) -> Result(Nil, RobotsParseError) { + case lines { + [] -> Ok(Nil) + [line, ..rest] -> + case string.contains(line, ":") { + True -> validate_lines(rest) + False -> Error(InvalidDirective(line)) + } + } +} + +/// Strips inline comments from a line (everything after `#`) +fn strip_comment(line: String) -> String { + case string.split_once(line, "#") { + Ok(#(before, _)) -> before + Error(_) -> line + } +} + +/// Splits a directive line into key and value on the first `:` +fn split_directive(line: String) -> Result(#(String, String), Nil) { + case string.split_once(line, ":") { + Ok(#(key, value)) -> Ok(#(string.trim(key), string.trim(value))) + Error(_) -> Error(Nil) + } +} + +fn is_sitemap_line(line: String) -> Bool { + case split_directive(line) { + Ok(#(key, _)) -> string.lowercase(key) == "sitemap" + Error(_) -> False + } +} + +fn find_sitemap(lines: List(String)) -> Option(String) { + lines + |> list.find(is_sitemap_line) + |> result.map(fn(line) { + case split_directive(line) { + Ok(#(_, value)) -> value + Error(_) -> "" + } + }) + |> option.from_result +} + +fn parse_robots( + lines: List(String), + acc: List(Robot), + current: Option(Robot), +) -> List(Robot) { + case lines { + [] -> + case current { + Some(r) -> list.reverse([r, ..acc]) + None -> list.reverse(acc) + } + [line, ..rest] -> { + case split_directive(line) { + Ok(#(key, value)) -> { + let lower_key = string.lowercase(key) + case lower_key { + "user-agent" -> { + let new_robot = Robot(value, [], []) + case current { + Some(r) -> parse_robots(rest, [r, ..acc], Some(new_robot)) + None -> parse_robots(rest, acc, Some(new_robot)) + } + } + _ -> + case current { + Some(r) -> { + let updated = parse_directive(lower_key, value, r) + parse_robots(rest, acc, Some(updated)) + } + None -> parse_robots(rest, acc, None) + } + } + } + Error(_) -> parse_robots(rest, acc, current) + } + } + } +} + +fn parse_directive(key: String, value: String, robot: Robot) -> Robot { + case key { + "allow" -> + Robot(..robot, allowed_routes: list.append(robot.allowed_routes, [value])) + "disallow" -> + Robot( + ..robot, + disallowed_routes: list.append(robot.disallowed_routes, [value]), + ) + _ -> robot + } +} diff --git a/src/webls/rss.gleam b/src/webls/rss.gleam index bcf9d2d..2c9dc57 100644 --- a/src/webls/rss.gleam +++ b/src/webls/rss.gleam @@ -1,9 +1,12 @@ +import gleam/dynamic/decode import gleam/int import gleam/list import gleam/option.{type Option, None, Some} import gleam/result +import gleam/string import gleam/time/calendar import gleam/time/timestamp.{type Timestamp} +import parsed_it/xml // Stringify ------------------------------------------------------------------ @@ -613,3 +616,319 @@ pub type Weekday { Saturday Sunday } + +// Decoders ------------------------------------------------------------------- + +/// Parses an RSS XML string into a list of RssChannels +pub fn from_string( + rss_xml: String, +) -> Result(List(RssChannel), xml.XmlDecodeError) { + xml.parse(from: rss_xml, using: rss_decoder()) +} + +fn rss_decoder() -> decode.Decoder(List(RssChannel)) { + // rss -> channel (single or list) + use channels <- decode.field( + "channel", + decode.one_of(decode.list(channel_decoder()), [ + channel_decoder() |> decode.map(fn(ch) { [ch] }), + ]), + ) + decode.success(channels) +} + +fn channel_decoder() -> decode.Decoder(RssChannel) { + use title <- decode.field("title", text_decoder()) + use link <- decode.field("link", text_decoder()) + use description <- decode.field("description", text_decoder()) + use language <- decode.optional_field( + "language", + None, + decode.optional(text_decoder()), + ) + use copyright <- decode.optional_field( + "copyright", + None, + decode.optional(text_decoder()), + ) + use managing_editor <- decode.optional_field( + "managingEditor", + None, + decode.optional(text_decoder()), + ) + use web_master <- decode.optional_field( + "webMaster", + None, + decode.optional(text_decoder()), + ) + use pub_date <- decode.optional_field( + "pubDate", + None, + decode.optional(timestamp_decoder()), + ) + use last_build_date <- decode.optional_field( + "lastBuildDate", + None, + decode.optional(timestamp_decoder()), + ) + use categories <- decode.optional_field("category", [], categories_decoder()) + use generator <- decode.optional_field( + "generator", + None, + decode.optional(text_decoder()), + ) + use docs <- decode.optional_field( + "docs", + None, + decode.optional(text_decoder()), + ) + use cloud <- decode.optional_field( + "cloud", + None, + decode.optional(cloud_decoder()), + ) + use ttl <- decode.optional_field( + "ttl", + None, + decode.optional(int_text_decoder()), + ) + use image <- decode.optional_field( + "image", + None, + decode.optional(image_decoder()), + ) + use text_input <- decode.optional_field( + "textInput", + None, + decode.optional(text_input_decoder()), + ) + use skip_hours <- decode.optional_field("skipHours", [], skip_hours_decoder()) + use skip_days <- decode.optional_field("skipDays", [], skip_days_decoder()) + use items <- decode.optional_field( + "item", + [], + decode.one_of(decode.list(item_decoder()), [ + item_decoder() |> decode.map(fn(item) { [item] }), + ]), + ) + decode.success(RssChannel( + title:, + link:, + description:, + language:, + copyright:, + managing_editor:, + web_master:, + pub_date:, + last_build_date:, + categories:, + generator:, + docs:, + cloud:, + ttl:, + image:, + text_input:, + skip_hours:, + skip_days:, + items:, + )) +} + +fn item_decoder() -> decode.Decoder(RssItem) { + use title <- decode.field("title", text_decoder()) + use description <- decode.field("description", text_decoder()) + use link <- decode.optional_field( + "link", + None, + decode.optional(text_decoder()), + ) + use author <- decode.optional_field( + "author", + None, + decode.optional(text_decoder()), + ) + use comments <- decode.optional_field( + "comments", + None, + decode.optional(text_decoder()), + ) + use source <- decode.optional_field( + "source", + None, + decode.optional(text_decoder()), + ) + use pub_date <- decode.optional_field( + "pubDate", + None, + decode.optional(timestamp_decoder()), + ) + use categories <- decode.optional_field("category", [], categories_decoder()) + use enclosure <- decode.optional_field( + "enclosure", + None, + decode.optional(enclosure_decoder()), + ) + use guid <- decode.optional_field( + "guid", + None, + decode.optional(guid_decoder()), + ) + decode.success(RssItem( + title:, + description:, + link:, + author:, + comments:, + source:, + pub_date:, + categories:, + enclosure:, + guid:, + )) +} + +fn text_decoder() -> decode.Decoder(String) { + decode.at(["$text"], decode.string) +} + +fn int_text_decoder() -> decode.Decoder(Int) { + use str <- decode.then(text_decoder()) + case int.parse(str) { + Ok(i) -> decode.success(i) + Error(_) -> decode.failure(0, "Int") + } +} + +fn timestamp_decoder() -> decode.Decoder(Timestamp) { + use date_str <- decode.then(text_decoder()) + case timestamp.parse_rfc3339(date_str) { + Ok(ts) -> decode.success(ts) + Error(_) -> decode.failure(timestamp.from_unix_seconds(0), "Timestamp") + } +} + +fn categories_decoder() -> decode.Decoder(List(String)) { + decode.one_of(decode.list(text_decoder()), [ + text_decoder() |> decode.map(fn(cat) { [cat] }), + ]) +} + +fn cloud_decoder() -> decode.Decoder(Cloud) { + // Cloud element uses attributes: domain, port, path, registerProcedure, protocol + use domain <- decode.field("$attrs", decode.at(["domain"], decode.string)) + use port <- decode.field("$attrs", decode.at(["port"], string_int_decoder())) + use path <- decode.field("$attrs", decode.at(["path"], decode.string)) + use register_procedure <- decode.field( + "$attrs", + decode.at(["registerProcedure"], decode.string), + ) + use protocol <- decode.field("$attrs", decode.at(["protocol"], decode.string)) + decode.success(Cloud(domain:, port:, path:, register_procedure:, protocol:)) +} + +fn string_int_decoder() -> decode.Decoder(Int) { + use str <- decode.then(decode.string) + case int.parse(str) { + Ok(i) -> decode.success(i) + Error(_) -> decode.failure(0, "Int") + } +} + +fn image_decoder() -> decode.Decoder(Image) { + use url <- decode.field("url", text_decoder()) + use title <- decode.field("title", text_decoder()) + use link <- decode.field("link", text_decoder()) + use description <- decode.optional_field( + "description", + None, + decode.optional(text_decoder()), + ) + use width <- decode.optional_field( + "width", + None, + decode.optional(int_text_decoder()), + ) + use height <- decode.optional_field( + "height", + None, + decode.optional(int_text_decoder()), + ) + decode.success(Image(url:, title:, link:, description:, width:, height:)) +} + +fn text_input_decoder() -> decode.Decoder(TextInput) { + use title <- decode.field("title", text_decoder()) + use description <- decode.field("description", text_decoder()) + use name <- decode.field("name", text_decoder()) + use link <- decode.field("link", text_decoder()) + decode.success(TextInput(title:, description:, name:, link:)) +} + +fn enclosure_decoder() -> decode.Decoder(Enclosure) { + // Enclosure element uses attributes: url, length, type + use url <- decode.field("$attrs", decode.at(["url"], decode.string)) + use length <- decode.field( + "$attrs", + decode.at(["length"], string_int_decoder()), + ) + use enclosure_type <- decode.field( + "$attrs", + decode.at(["type"], decode.string), + ) + decode.success(Enclosure(url:, length:, enclosure_type:)) +} + +fn guid_decoder() -> decode.Decoder(#(String, Option(Bool))) { + use guid_text <- decode.field("$text", decode.string) + use is_permalink <- decode.optional_field( + "$attrs", + None, + decode.optional(decode.at(["isPermaLink"], bool_string_decoder())), + ) + decode.success(#(guid_text, is_permalink)) +} + +fn bool_string_decoder() -> decode.Decoder(Bool) { + use s <- decode.then(decode.string) + case string.lowercase(s) { + "true" -> decode.success(True) + "false" -> decode.success(False) + _ -> decode.failure(False, "Bool") + } +} + +fn skip_hours_decoder() -> decode.Decoder(List(Int)) { + use hours <- decode.optional_field( + "hour", + [], + decode.one_of(decode.list(int_text_decoder()), [ + int_text_decoder() |> decode.map(fn(h) { [h] }), + ]), + ) + decode.success(hours) +} + +fn skip_days_decoder() -> decode.Decoder(List(Weekday)) { + use days <- decode.optional_field( + "day", + [], + decode.one_of(decode.list(weekday_decoder()), [ + weekday_decoder() |> decode.map(fn(d) { [d] }), + ]), + ) + decode.success(days) +} + +fn weekday_decoder() -> decode.Decoder(Weekday) { + use day_str <- decode.then(text_decoder()) + case string.lowercase(day_str) { + "monday" -> decode.success(Monday) + "tuesday" -> decode.success(Tuesday) + "wednesday" -> decode.success(Wednesday) + "thursday" -> decode.success(Thursday) + "friday" -> decode.success(Friday) + "saturday" -> decode.success(Saturday) + "sunday" -> decode.success(Sunday) + _ -> decode.failure(Monday, "Weekday") + } +} diff --git a/src/webls/sitemap.gleam b/src/webls/sitemap.gleam index 8deaef7..4a9796d 100644 --- a/src/webls/sitemap.gleam +++ b/src/webls/sitemap.gleam @@ -1,9 +1,12 @@ +import gleam/dynamic/decode import gleam/float +import gleam/int import gleam/list import gleam/option.{type Option, None, Some} import gleam/result import gleam/time/calendar import gleam/time/timestamp.{type Timestamp} +import parsed_it/xml // Stringify ------------------------------------------------------------------ @@ -20,6 +23,34 @@ pub fn to_string(sitemap: Sitemap) -> String { <> "\n" } +/// Generates a sitemap index XML string from a sitemap index +pub fn index_to_string(index: SitemapIndex) -> String { + let sitemap_content = + index.sitemaps + |> list.map(fn(ref) { ref |> sitemap_reference_to_string }) + |> list.reduce(fn(acc, ref_string) { acc <> "\n" <> ref_string }) + |> result.unwrap("") + + "\n\n" + <> sitemap_content + <> "\n" +} + +fn sitemap_reference_to_string(ref: SitemapReference) -> String { + "\n" + <> "" + <> ref.loc + <> "\n" + <> case ref.last_modified { + Some(date) -> + "" + <> date |> timestamp.to_rfc3339(calendar.utc_offset) + <> "\n" + _ -> "" + } + <> "" +} + fn sitemap_item_to_string(item: SitemapItem) -> String { "\n" <> "" @@ -113,6 +144,40 @@ pub fn with_item_last_modified( SitemapItem(..item, last_modified: Some(modified)) } +/// Create an empty sitemap index +pub fn sitemap_index() -> SitemapIndex { + SitemapIndex(sitemaps: []) +} + +/// Adds a sitemap reference to the sitemap index +pub fn with_index_sitemap( + index: SitemapIndex, + ref: SitemapReference, +) -> SitemapIndex { + SitemapIndex(sitemaps: [ref, ..index.sitemaps]) +} + +/// Adds a list of sitemap references to the sitemap index +pub fn with_index_sitemaps( + index: SitemapIndex, + refs: List(SitemapReference), +) -> SitemapIndex { + SitemapIndex(sitemaps: list.flatten([index.sitemaps, refs])) +} + +/// Create a sitemap reference with a URL location +pub fn reference(loc: String) -> SitemapReference { + SitemapReference(loc: loc, last_modified: None) +} + +/// Add a last modified time to a sitemap reference +pub fn with_reference_last_modified( + ref: SitemapReference, + last_modified: Timestamp, +) -> SitemapReference { + SitemapReference(..ref, last_modified: Some(last_modified)) +} + // Types ---------------------------------------------------------------------- /// A complete sitemap @@ -127,6 +192,24 @@ pub type Sitemap { ) } +/// A sitemap index that references multiple sitemaps +pub type SitemapIndex { + SitemapIndex( + /// The list of sitemap references + sitemaps: List(SitemapReference), + ) +} + +/// A reference to a sitemap within a sitemap index +pub type SitemapReference { + SitemapReference( + /// The location URL of the sitemap + loc: String, + /// The time of last modification of the referenced sitemap + last_modified: Option(Timestamp), + ) +} + /// A item within a sitemap pub type SitemapItem { SitemapItem( @@ -152,3 +235,129 @@ pub type ChangeFrequency { Yearly Never } + +// Decoders ------------------------------------------------------------------- + +/// Result of parsing a sitemap XML - either a regular sitemap or an index +pub type SitemapParseResult { + ParsedSitemap(Sitemap) + ParsedSitemapIndex(SitemapIndex) +} + +/// Parses a sitemap XML string into a Sitemap +pub fn from_string(sitemap_xml: String) -> Result(Sitemap, xml.XmlDecodeError) { + xml.parse(from: sitemap_xml, using: sitemap_decoder()) +} + +/// Parses a sitemap index XML string into a SitemapIndex +pub fn index_from_string( + sitemap_xml: String, +) -> Result(SitemapIndex, xml.XmlDecodeError) { + xml.parse(from: sitemap_xml, using: sitemap_index_decoder()) +} + +/// Parses a sitemap XML string, detecting whether it's a regular sitemap or index +/// Returns a SitemapParseResult indicating which type was parsed +pub fn parse( + sitemap_xml: String, +) -> Result(SitemapParseResult, xml.XmlDecodeError) { + // Try parsing as regular sitemap first + case from_string(sitemap_xml) { + Ok(sitemap) -> Ok(ParsedSitemap(sitemap)) + Error(_) -> + // Try parsing as sitemap index + case index_from_string(sitemap_xml) { + Ok(index) -> Ok(ParsedSitemapIndex(index)) + Error(e) -> Error(e) + } + } +} + +fn sitemap_decoder() -> decode.Decoder(Sitemap) { + // urlset is the root element, url children contain the items + // When there are multiple elements, they become a list + // When there's a single element, it's a single object + use items <- decode.field( + "url", + decode.one_of(decode.list(sitemap_item_decoder()), [ + sitemap_item_decoder() |> decode.map(fn(item) { [item] }), + ]), + ) + decode.success(Sitemap(url: "", last_modified: None, items:)) +} + +fn change_frequency_decoder() -> decode.Decoder(ChangeFrequency) { + use variant <- decode.then(decode.at(["$text"], decode.string)) + case variant { + "always" -> decode.success(Always) + "hourly" -> decode.success(Hourly) + "daily" -> decode.success(Daily) + "weekly" -> decode.success(Weekly) + "monthly" -> decode.success(Monthly) + "yearly" -> decode.success(Yearly) + "never" -> decode.success(Never) + _ -> decode.failure(Never, "ChangeFrequency") + } +} + +fn sitemap_item_decoder() -> decode.Decoder(SitemapItem) { + use loc <- decode.field("loc", decode.at(["$text"], decode.string)) + use last_modified <- decode.optional_field( + "lastmod", + None, + decode.optional(timestamp_decoder()), + ) + use change_frequency <- decode.optional_field( + "changefreq", + None, + decode.optional(change_frequency_decoder()), + ) + use priority <- decode.optional_field( + "priority", + None, + decode.optional(decode.at(["$text"], string_float_decoder())), + ) + decode.success(SitemapItem(loc:, last_modified:, change_frequency:, priority:)) +} + +fn string_float_decoder() -> decode.Decoder(Float) { + use str <- decode.then(decode.string) + case float.parse(str) { + Ok(f) -> decode.success(f) + Error(_) -> + // Try parsing as int and convert to float + case int.parse(str) { + Ok(i) -> decode.success(int.to_float(i)) + Error(_) -> decode.failure(0.0, "Float") + } + } +} + +fn timestamp_decoder() -> decode.Decoder(Timestamp) { + use date_str <- decode.then(decode.at(["$text"], decode.string)) + case timestamp.parse_rfc3339(date_str) { + Ok(ts) -> decode.success(ts) + Error(_) -> decode.failure(timestamp.from_unix_seconds(0), "Timestamp") + } +} + +fn sitemap_index_decoder() -> decode.Decoder(SitemapIndex) { + // sitemapindex is the root element, sitemap children contain the references + use sitemaps <- decode.field( + "sitemap", + decode.one_of(decode.list(sitemap_reference_decoder()), [ + sitemap_reference_decoder() |> decode.map(fn(ref) { [ref] }), + ]), + ) + decode.success(SitemapIndex(sitemaps:)) +} + +fn sitemap_reference_decoder() -> decode.Decoder(SitemapReference) { + use loc <- decode.field("loc", decode.at(["$text"], decode.string)) + use last_modified <- decode.optional_field( + "lastmod", + None, + decode.optional(timestamp_decoder()), + ) + decode.success(SitemapReference(loc:, last_modified:)) +} diff --git a/test/fixtures/robots/case_insensitive.txt b/test/fixtures/robots/case_insensitive.txt new file mode 100644 index 0000000..5e7350e --- /dev/null +++ b/test/fixtures/robots/case_insensitive.txt @@ -0,0 +1,4 @@ +SITEMAP: https://example.com/sitemap.xml +USER-AGENT: googlebot +ALLOW: /posts/ +DISALLOW: /admin/ \ No newline at end of file diff --git a/test/fixtures/robots/flipped_order.txt b/test/fixtures/robots/flipped_order.txt new file mode 100644 index 0000000..f5478a0 --- /dev/null +++ b/test/fixtures/robots/flipped_order.txt @@ -0,0 +1,3 @@ +User-agent: googlebot +Disallow: /admin/ +Allow: /posts/ \ No newline at end of file diff --git a/test/fixtures/robots/no_sitemap.txt b/test/fixtures/robots/no_sitemap.txt new file mode 100644 index 0000000..f56fd91 --- /dev/null +++ b/test/fixtures/robots/no_sitemap.txt @@ -0,0 +1,2 @@ +User-agent: googlebot +Allow: /posts/ \ No newline at end of file diff --git a/test/fixtures/robots.txt b/test/fixtures/robots/robots.txt similarity index 100% rename from test/fixtures/robots.txt rename to test/fixtures/robots/robots.txt diff --git a/test/fixtures/robots/whitespace.txt b/test/fixtures/robots/whitespace.txt new file mode 100644 index 0000000..0b78393 --- /dev/null +++ b/test/fixtures/robots/whitespace.txt @@ -0,0 +1,6 @@ + + Sitemap: https://example.com/sitemap.xml + + User-agent: * + Allow: / + diff --git a/test/fixtures/robots/with_comments.txt b/test/fixtures/robots/with_comments.txt new file mode 100644 index 0000000..b7d07ed --- /dev/null +++ b/test/fixtures/robots/with_comments.txt @@ -0,0 +1,6 @@ +# This is a robots.txt file with comments +Sitemap: https://example.com/sitemap.xml + +User-agent: googlebot # Google's crawler +Allow: /posts/ +Disallow: /admin/ # Keep admin private \ No newline at end of file diff --git a/test/fixtures/rss/full_channel.xml b/test/fixtures/rss/full_channel.xml new file mode 100644 index 0000000..d30b870 --- /dev/null +++ b/test/fixtures/rss/full_channel.xml @@ -0,0 +1,27 @@ + + + +Full Featured Feed +https://example.com +An RSS feed with all channel fields +en-us +Copyright 2024 Example Inc. +editor@example.com +webmaster@example.com +2024-01-15T12:00:00.000Z +2024-06-20T08:30:00.000Z +Technology +webls +https://www.rssboard.org/rss-2-0-1 +30 + +Full Item +An item with all fields +https://example.com/full-item +author@example.com +https://example.com/full-item/comments +2024-06-20T08:00:00.000Z +Original Source +https://example.com/full-item + + \ No newline at end of file diff --git a/test/fixtures/rss/minimal.xml b/test/fixtures/rss/minimal.xml new file mode 100644 index 0000000..f448dae --- /dev/null +++ b/test/fixtures/rss/minimal.xml @@ -0,0 +1,8 @@ + + + +Minimal Feed +https://example.com +A minimal RSS feed + + \ No newline at end of file diff --git a/test/fixtures/rss.xml b/test/fixtures/rss/rss.xml similarity index 100% rename from test/fixtures/rss.xml rename to test/fixtures/rss/rss.xml diff --git a/test/fixtures/rss/single_item.xml b/test/fixtures/rss/single_item.xml new file mode 100644 index 0000000..9af98a4 --- /dev/null +++ b/test/fixtures/rss/single_item.xml @@ -0,0 +1,11 @@ + + + +Single Item Feed +https://example.com +A feed with just one item + +Only Item +The only item in this feed + + \ No newline at end of file diff --git a/test/fixtures/rss/with_categories.xml b/test/fixtures/rss/with_categories.xml new file mode 100644 index 0000000..d0f22d9 --- /dev/null +++ b/test/fixtures/rss/with_categories.xml @@ -0,0 +1,15 @@ + + + +Categorized Feed +https://example.com +An RSS feed with categories +Technology +Programming + +Gleam Article +An article about Gleam +Gleam +Functional Programming + + \ No newline at end of file diff --git a/test/fixtures/rss/with_cloud.xml b/test/fixtures/rss/with_cloud.xml new file mode 100644 index 0000000..9e1e8b9 --- /dev/null +++ b/test/fixtures/rss/with_cloud.xml @@ -0,0 +1,13 @@ + + + +Cloud Feed +https://example.com +An RSS feed with cloud configuration + +60 + +Test Item +Test description + + \ No newline at end of file diff --git a/test/fixtures/rss/with_enclosure.xml b/test/fixtures/rss/with_enclosure.xml new file mode 100644 index 0000000..0e6fb83 --- /dev/null +++ b/test/fixtures/rss/with_enclosure.xml @@ -0,0 +1,18 @@ + + + +Podcast Feed +https://example.com/podcast +A podcast feed with enclosures + +Episode 1 +The first episode + + + +Episode 2 +The second episode +https://example.com/podcast/ep2 + + + \ No newline at end of file diff --git a/test/fixtures/rss/with_image.xml b/test/fixtures/rss/with_image.xml new file mode 100644 index 0000000..d7ace5c --- /dev/null +++ b/test/fixtures/rss/with_image.xml @@ -0,0 +1,18 @@ + + + +Feed with Image +https://example.com +An RSS feed with an image +https://example.com/logo.png +Example Logo +https://example.com +The logo for Example +144 +88 + + +Test Item +Test description + + \ No newline at end of file diff --git a/test/fixtures/rss/with_skip_hours_days.xml b/test/fixtures/rss/with_skip_hours_days.xml new file mode 100644 index 0000000..99d3c7c --- /dev/null +++ b/test/fixtures/rss/with_skip_hours_days.xml @@ -0,0 +1,16 @@ + + + +Skip Schedule Feed +https://example.com +An RSS feed with skip hours and days +0 +1 +2 +Saturday +Sunday + +Test Item +Test description + + \ No newline at end of file diff --git a/test/fixtures/rss/with_text_input.xml b/test/fixtures/rss/with_text_input.xml new file mode 100644 index 0000000..f0d98a1 --- /dev/null +++ b/test/fixtures/rss/with_text_input.xml @@ -0,0 +1,16 @@ + + + +Searchable Feed +https://example.com +An RSS feed with text input +Search +Search this feed +q +https://example.com/search + + +Test Item +Test description + + \ No newline at end of file diff --git a/test/fixtures/sitemap/all_frequencies.xml b/test/fixtures/sitemap/all_frequencies.xml new file mode 100644 index 0000000..f1628ef --- /dev/null +++ b/test/fixtures/sitemap/all_frequencies.xml @@ -0,0 +1,31 @@ + + + +https://example.com/always +always + + +https://example.com/hourly +hourly + + +https://example.com/daily +daily + + +https://example.com/weekly +weekly + + +https://example.com/monthly +monthly + + +https://example.com/yearly +yearly + + +https://example.com/never +never + + \ No newline at end of file diff --git a/test/fixtures/sitemap/minimal.xml b/test/fixtures/sitemap/minimal.xml new file mode 100644 index 0000000..55edfde --- /dev/null +++ b/test/fixtures/sitemap/minimal.xml @@ -0,0 +1,6 @@ + + + +https://example.com + + \ No newline at end of file diff --git a/test/fixtures/sitemap/single_item.xml b/test/fixtures/sitemap/single_item.xml new file mode 100644 index 0000000..23e170d --- /dev/null +++ b/test/fixtures/sitemap/single_item.xml @@ -0,0 +1,8 @@ + + + +https://example.com/single +weekly +0.8 + + \ No newline at end of file diff --git a/test/fixtures/sitemap.xml b/test/fixtures/sitemap/sitemap.xml similarity index 100% rename from test/fixtures/sitemap.xml rename to test/fixtures/sitemap/sitemap.xml diff --git a/test/fixtures/sitemap/sitemap_index.xml b/test/fixtures/sitemap/sitemap_index.xml new file mode 100644 index 0000000..17558aa --- /dev/null +++ b/test/fixtures/sitemap/sitemap_index.xml @@ -0,0 +1,9 @@ + + + +https://example.com/sitemap-0.xml + + +https://example.com/sitemap-1.xml + + diff --git a/test/fixtures/sitemap/with_lastmod.xml b/test/fixtures/sitemap/with_lastmod.xml new file mode 100644 index 0000000..3b4d499 --- /dev/null +++ b/test/fixtures/sitemap/with_lastmod.xml @@ -0,0 +1,13 @@ + + + +https://example.com +2024-06-15T10:30:00.000Z +daily +1.0 + + +https://example.com/about +2024-05-01T08:00:00.000Z + + \ No newline at end of file diff --git a/test/fixtures/sitemap/with_priorities.xml b/test/fixtures/sitemap/with_priorities.xml new file mode 100644 index 0000000..94fff03 --- /dev/null +++ b/test/fixtures/sitemap/with_priorities.xml @@ -0,0 +1,19 @@ + + + +https://example.com/high +1.0 + + +https://example.com/medium +0.5 + + +https://example.com/low +0.1 + + +https://example.com/zero +0 + + \ No newline at end of file diff --git a/test/robots_test.gleam b/test/robots_test.gleam index 4f5b436..7c11e98 100644 --- a/test/robots_test.gleam +++ b/test/robots_test.gleam @@ -1,3 +1,4 @@ +import gleam/option import gleeunit/should import simplifile import webls/robots @@ -17,9 +18,153 @@ pub fn robots_to_string_test() -> Nil { |> robots.with_robot_disallowed_routes(["/"]), ]) - let assert Ok(expected) = simplifile.read("test/fixtures/robots.txt") + let assert Ok(expected) = simplifile.read("test/fixtures/robots/robots.txt") config |> robots.to_string |> should.equal(expected) } + +/// Confirms that a robots.txt string can be parsed into a RobotsConfig +pub fn robots_from_string_test() -> Nil { + let assert Ok(input) = simplifile.read("test/fixtures/robots/robots.txt") + + let assert Ok(config) = robots.from_string(input) + + config.sitemap_url + |> should.equal(option.Some("https://example.com/sitemap.xml")) + + config.robots + |> should.equal([ + robots.Robot("googlebot", ["/posts/", "/contact/"], ["/admin/", "/private/"]), + robots.Robot("bingbot", ["/posts/", "/contact/", "/private/"], ["/"]), + ]) +} + +/// Confirms roundtrip: to_string -> from_string produces equivalent config +pub fn robots_roundtrip_test() -> Nil { + let original = + robots.config("https://example.com/sitemap.xml") + |> robots.with_config_robots([ + robots.robot("googlebot") + |> robots.with_robot_allowed_routes(["/posts/", "/contact/"]) + |> robots.with_robot_disallowed_routes(["/admin/", "/private/"]), + robots.robot("bingbot") + |> robots.with_robot_allowed_routes([ + "/posts/", "/contact/", "/private/", + ]) + |> robots.with_robot_disallowed_routes(["/"]), + ]) + + let serialized = robots.to_string(original) + let assert Ok(parsed) = robots.from_string(serialized) + + parsed.sitemap_url + |> should.equal(original.sitemap_url) + + parsed.robots + |> should.equal(original.robots) +} + +/// Confirms that parsing works when Sitemap directive is missing (it's optional) +pub fn robots_from_string_no_sitemap_test() -> Nil { + let assert Ok(input) = simplifile.read("test/fixtures/robots/no_sitemap.txt") + + let assert Ok(config) = robots.from_string(input) + + config.sitemap_url + |> should.equal(option.None) + + config.robots + |> should.equal([robots.Robot("googlebot", ["/posts/"], [])]) +} + +/// Confirms parsing handles extra whitespace and blank lines +pub fn robots_from_string_whitespace_test() -> Nil { + let assert Ok(input) = simplifile.read("test/fixtures/robots/whitespace.txt") + + let assert Ok(config) = robots.from_string(input) + + config.sitemap_url + |> should.equal(option.Some("https://example.com/sitemap.xml")) + + config.robots + |> should.equal([robots.Robot("*", ["/"], [])]) +} + +/// Confirms parsing is case-insensitive for directives +pub fn robots_from_string_case_insensitive_test() -> Nil { + let assert Ok(input) = + simplifile.read("test/fixtures/robots/case_insensitive.txt") + + let assert Ok(config) = robots.from_string(input) + + config.sitemap_url + |> should.equal(option.Some("https://example.com/sitemap.xml")) + + config.robots + |> should.equal([robots.Robot("googlebot", ["/posts/"], ["/admin/"])]) +} + +/// Confirms parsing handles comments (full-line and inline) +pub fn robots_from_string_comments_test() -> Nil { + let assert Ok(input) = + simplifile.read("test/fixtures/robots/with_comments.txt") + + let assert Ok(config) = robots.from_string(input) + + config.sitemap_url + |> should.equal(option.Some("https://example.com/sitemap.xml")) + + config.robots + |> should.equal([robots.Robot("googlebot", ["/posts/"], ["/admin/"])]) +} + +/// Confirms parsing works with Disallow before Allow +pub fn robots_from_string_flipped_order_test() -> Nil { + let assert Ok(input) = + simplifile.read("test/fixtures/robots/flipped_order.txt") + + let assert Ok(config) = robots.from_string(input) + + config.robots + |> should.equal([robots.Robot("googlebot", ["/posts/"], ["/admin/"])]) +} + +/// Confirms parsing fails on malformed lines (missing colon) +pub fn robots_from_string_invalid_test() -> Nil { + let input = "User-agent: googlebot\nthis is not a valid directive\nAllow: /" + + robots.from_string(input) + |> should.equal( + Error(robots.InvalidDirective("this is not a valid directive")), + ) +} + +/// Confirms empty input returns empty config (not an error) +pub fn robots_from_string_empty_test() -> Nil { + let assert Ok(config) = robots.from_string("") + + config.sitemap_url + |> should.equal(option.None) + + config.robots + |> should.equal([]) +} + +/// Confirms config_without_sitemap builder works +pub fn robots_config_without_sitemap_test() -> Nil { + let config = + robots.config_without_sitemap() + |> robots.with_config_robot( + robots.robot("*") + |> robots.with_robot_disallowed_route("/admin/"), + ) + + config.sitemap_url + |> should.equal(option.None) + + config + |> robots.to_string + |> should.equal("User-agent: *\n\nDisallow: /admin/") +} diff --git a/test/rss_test.gleam b/test/rss_test.gleam index afd1e7b..9f231a0 100644 --- a/test/rss_test.gleam +++ b/test/rss_test.gleam @@ -1,4 +1,4 @@ -import gleam/option.{Some} +import gleam/option.{None, Some} import gleam/time/calendar import gleam/time/timestamp import gleeunit/should @@ -27,9 +27,460 @@ pub fn rss_to_string_test() -> Nil { ]), ] - let assert Ok(expected) = simplifile.read("test/fixtures/rss.xml") + let assert Ok(expected) = simplifile.read("test/fixtures/rss/rss.xml") channels |> rss.to_string() |> should.equal(expected) } + +/// Confirms that from_string parses the RSS fixture correctly +pub fn rss_from_string_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/rss.xml") + let assert Ok(channels) = rss.from_string(xml) + + // Should have one channel + let assert [channel] = channels + + // Verify channel metadata + channel.title |> should.equal("Gleam RSS") + channel.link |> should.equal("https://gleam.run") + channel.description |> should.equal("A test RSS feed") + channel.language |> should.equal(Some("en")) + channel.categories |> should.equal(["Releases"]) + + // Verify items were parsed correctly + let assert [item1, item2] = channel.items + + // First item + item1.title |> should.equal("Gleam 1.0") + item1.description |> should.equal("Gleam 1.0 is here!") + item1.link |> should.equal(Some("https://gleam.run/blog/gleam-1.0")) + item1.author |> should.equal(None) + item1.guid |> should.equal(Some(#("gleam 1.0", Some(False)))) + + // Second item + item2.title |> should.equal("Gleam 0.10") + item2.description |> should.equal("Gleam 0.10 is here!") + item2.link |> should.equal(Some("https://gleam.run/blog/gleam-0.10")) + item2.author |> should.equal(Some("user@example.com")) + item2.guid |> should.equal(Some(#("gleam 0.10", Some(True)))) +} + +/// Confirms roundtrip: to_string -> from_string produces equivalent channel +pub fn rss_roundtrip_test() -> Nil { + let original = [ + rss.channel("Test Feed", "A test feed", "https://example.com") + |> rss.with_channel_language("en-us") + |> rss.with_channel_items([ + rss.item("Article 1", "First article content") + |> rss.with_item_link("https://example.com/article-1"), + ]), + ] + + let serialized = rss.to_string(original) + let assert Ok(parsed) = rss.from_string(serialized) + + let assert [orig_channel] = original + let assert [parsed_channel] = parsed + + parsed_channel.title |> should.equal(orig_channel.title) + parsed_channel.link |> should.equal(orig_channel.link) + parsed_channel.description |> should.equal(orig_channel.description) + parsed_channel.language |> should.equal(orig_channel.language) +} + +/// Confirms parsing of minimal RSS feed (just required fields) +pub fn rss_from_string_minimal_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/minimal.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + + channel.title |> should.equal("Minimal Feed") + channel.link |> should.equal("https://example.com") + channel.description |> should.equal("A minimal RSS feed") + channel.language |> should.equal(None) + channel.copyright |> should.equal(None) + channel.items |> should.equal([]) +} + +/// Confirms parsing of RSS feed with image +pub fn rss_from_string_with_image_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/with_image.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + let assert Some(image) = channel.image + + image.url |> should.equal("https://example.com/logo.png") + image.title |> should.equal("Example Logo") + image.link |> should.equal("https://example.com") + image.description |> should.equal(Some("The logo for Example")) + image.width |> should.equal(Some(144)) + image.height |> should.equal(Some(88)) +} + +/// Confirms parsing of RSS feed with enclosures (podcast style) +pub fn rss_from_string_with_enclosure_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/with_enclosure.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + let assert [item1, item2] = channel.items + + let assert Some(enc1) = item1.enclosure + enc1.url |> should.equal("https://example.com/ep1.mp3") + enc1.length |> should.equal(12_345_678) + enc1.enclosure_type |> should.equal("audio/mpeg") + + let assert Some(enc2) = item2.enclosure + enc2.url |> should.equal("https://example.com/ep2.mp3") + enc2.length |> should.equal(9_876_543) +} + +/// Confirms parsing of RSS feed with cloud configuration +pub fn rss_from_string_with_cloud_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/with_cloud.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + let assert Some(cloud) = channel.cloud + + cloud.domain |> should.equal("rpc.example.com") + cloud.port |> should.equal(80) + cloud.path |> should.equal("/RPC2") + cloud.register_procedure |> should.equal("pingMe") + cloud.protocol |> should.equal("soap") + + channel.ttl |> should.equal(Some(60)) +} + +/// Confirms parsing of RSS feed with text input +pub fn rss_from_string_with_text_input_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/with_text_input.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + let assert Some(text_input) = channel.text_input + + text_input.title |> should.equal("Search") + text_input.description |> should.equal("Search this feed") + text_input.name |> should.equal("q") + text_input.link |> should.equal("https://example.com/search") +} + +/// Confirms parsing of RSS feed with skip hours and days +pub fn rss_from_string_with_skip_hours_days_test() -> Nil { + let assert Ok(xml) = + simplifile.read("test/fixtures/rss/with_skip_hours_days.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + + channel.skip_hours |> should.equal([0, 1, 2]) + channel.skip_days |> should.equal([rss.Saturday, rss.Sunday]) +} + +/// Confirms parsing of RSS feed with categories +pub fn rss_from_string_with_categories_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/with_categories.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + + channel.categories |> should.equal(["Technology", "Programming"]) + + let assert [item] = channel.items + item.categories |> should.equal(["Gleam", "Functional Programming"]) +} + +/// Confirms parsing of full-featured RSS channel +pub fn rss_from_string_full_channel_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/full_channel.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + + channel.title |> should.equal("Full Featured Feed") + channel.language |> should.equal(Some("en-us")) + channel.copyright |> should.equal(Some("Copyright 2024 Example Inc.")) + channel.managing_editor |> should.equal(Some("editor@example.com")) + channel.web_master |> should.equal(Some("webmaster@example.com")) + channel.pub_date |> should.be_some + channel.last_build_date |> should.be_some + channel.generator |> should.equal(Some("webls")) + channel.docs |> should.equal(Some("https://www.rssboard.org/rss-2-0-1")) + channel.ttl |> should.equal(Some(30)) + + let assert [item] = channel.items + item.title |> should.equal("Full Item") + item.link |> should.equal(Some("https://example.com/full-item")) + item.author |> should.equal(Some("author@example.com")) + item.comments |> should.equal(Some("https://example.com/full-item/comments")) + item.source |> should.equal(Some("Original Source")) + item.guid + |> should.equal(Some(#("https://example.com/full-item", Some(True)))) +} + +/// Confirms parsing handles single item (not wrapped in list) +pub fn rss_from_string_single_item_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/rss/single_item.xml") + let assert Ok(channels) = rss.from_string(xml) + + let assert [channel] = channels + let assert [item] = channel.items + + item.title |> should.equal("Only Item") + item.description |> should.equal("The only item in this feed") +} + +/// Confirms channel builder with_channel_copyright works +pub fn rss_with_channel_copyright_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_copyright("Copyright 2024") + + channel.copyright |> should.equal(Some("Copyright 2024")) +} + +/// Confirms channel builder with_channel_managing_editor works +pub fn rss_with_channel_managing_editor_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_managing_editor("editor@example.com") + + channel.managing_editor |> should.equal(Some("editor@example.com")) +} + +/// Confirms channel builder with_channel_web_master works +pub fn rss_with_channel_web_master_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_web_master("webmaster@example.com") + + channel.web_master |> should.equal(Some("webmaster@example.com")) +} + +/// Confirms channel builder with_channel_pub_date works +pub fn rss_with_channel_pub_date_test() -> Nil { + let ts = + timestamp.from_calendar( + calendar.Date(2024, calendar.June, 15), + calendar.TimeOfDay(10, 30, 0, 0), + calendar.utc_offset, + ) + + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_pub_date(ts) + + channel.pub_date |> should.equal(Some(ts)) +} + +/// Confirms channel builder with_channel_last_build_date works +pub fn rss_with_channel_last_build_date_test() -> Nil { + let ts = + timestamp.from_calendar( + calendar.Date(2024, calendar.June, 15), + calendar.TimeOfDay(10, 30, 0, 0), + calendar.utc_offset, + ) + + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_last_build_date(ts) + + channel.last_build_date |> should.equal(Some(ts)) +} + +/// Confirms channel builder with_channel_categories works +pub fn rss_with_channel_categories_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_categories(["Tech", "Programming"]) + + channel.categories |> should.equal(["Tech", "Programming"]) +} + +/// Confirms channel builder with_channel_generator works +pub fn rss_with_channel_generator_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_generator() + + channel.generator |> should.equal(Some("webls")) +} + +/// Confirms channel builder with_channel_custom_generator works +pub fn rss_with_channel_custom_generator_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_custom_generator("MyApp v1.0") + + channel.generator |> should.equal(Some("MyApp v1.0")) +} + +/// Confirms channel builder with_channel_docs works +pub fn rss_with_channel_docs_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_docs() + + channel.docs |> should.equal(Some("https://www.rssboard.org/rss-2-0-1")) +} + +/// Confirms channel builder with_channel_cloud works +pub fn rss_with_channel_cloud_test() -> Nil { + let cloud = + rss.Cloud( + domain: "rpc.example.com", + port: 80, + path: "/RPC2", + register_procedure: "pingMe", + protocol: "soap", + ) + + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_cloud(cloud) + + channel.cloud |> should.equal(Some(cloud)) +} + +/// Confirms channel builder with_channel_ttl works +pub fn rss_with_channel_ttl_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_ttl(60) + + channel.ttl |> should.equal(Some(60)) +} + +/// Confirms channel builder with_channel_image works +pub fn rss_with_channel_image_test() -> Nil { + let image = + rss.Image( + url: "https://example.com/logo.png", + title: "Logo", + link: "https://example.com", + description: Some("A logo"), + width: Some(100), + height: Some(50), + ) + + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_image(image) + + channel.image |> should.equal(Some(image)) +} + +/// Confirms channel builder with_channel_text_input works +pub fn rss_with_channel_text_input_test() -> Nil { + let text_input = + rss.TextInput( + title: "Search", + description: "Search the feed", + name: "q", + link: "https://example.com/search", + ) + + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_text_input(text_input) + + channel.text_input |> should.equal(Some(text_input)) +} + +/// Confirms channel builder with_channel_skip_hours works +pub fn rss_with_channel_skip_hours_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_skip_hours([0, 1, 2, 3]) + + channel.skip_hours |> should.equal([0, 1, 2, 3]) +} + +/// Confirms channel builder with_channel_skip_days works +pub fn rss_with_channel_skip_days_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_skip_days([rss.Saturday, rss.Sunday]) + + channel.skip_days |> should.equal([rss.Saturday, rss.Sunday]) +} + +/// Confirms channel builder with_channel_item (singular) works +pub fn rss_with_channel_item_test() -> Nil { + let channel = + rss.channel("Test", "Desc", "https://example.com") + |> rss.with_channel_item(rss.item("Item 1", "Description 1")) + |> rss.with_channel_item(rss.item("Item 2", "Description 2")) + + let assert [item2, item1] = channel.items + item1.title |> should.equal("Item 1") + item2.title |> should.equal("Item 2") +} + +/// Confirms item builder with_item_categories works +pub fn rss_with_item_categories_test() -> Nil { + let item = + rss.item("Test", "Desc") + |> rss.with_item_categories(["Tech", "News"]) + + item.categories |> should.equal(["Tech", "News"]) +} + +/// Confirms item builder with_item_comments works +pub fn rss_with_item_comments_test() -> Nil { + let item = + rss.item("Test", "Desc") + |> rss.with_item_comments("https://example.com/comments") + + item.comments |> should.equal(Some("https://example.com/comments")) +} + +/// Confirms item builder with_item_enclosure works +pub fn rss_with_item_enclosure_test() -> Nil { + let enclosure = + rss.Enclosure( + url: "https://example.com/audio.mp3", + length: 12_345_678, + enclosure_type: "audio/mpeg", + ) + + let item = + rss.item("Test", "Desc") + |> rss.with_item_enclosure(enclosure) + + item.enclosure |> should.equal(Some(enclosure)) +} + +/// Confirms item builder with_item_source works +pub fn rss_with_item_source_test() -> Nil { + let item = + rss.item("Test", "Desc") + |> rss.with_item_source("Original Feed") + + item.source |> should.equal(Some("Original Feed")) +} + +/// Confirms empty channel produces valid XML +pub fn rss_empty_channel_test() -> Nil { + let channels = [rss.channel("Empty", "An empty feed", "https://example.com")] + + let result = rss.to_string(channels) + + result + |> should.equal( + " + + +Empty +https://example.com +An empty feed + +", + ) +} diff --git a/test/sitemap_test.gleam b/test/sitemap_test.gleam index b748adf..4abc11d 100644 --- a/test/sitemap_test.gleam +++ b/test/sitemap_test.gleam @@ -1,3 +1,6 @@ +import gleam/option.{None, Some} +import gleam/time/calendar +import gleam/time/timestamp import gleeunit/should import simplifile import webls/sitemap @@ -16,9 +19,316 @@ pub fn sitemap_to_string_test() -> Nil { sitemap.item("https://gleam.run/blog/gleam-1.1"), ]) - let assert Ok(expected) = simplifile.read("test/fixtures/sitemap.xml") + let assert Ok(expected) = simplifile.read("test/fixtures/sitemap/sitemap.xml") sitemap |> sitemap.to_string() |> should.equal(expected) } + +/// Confirms that from_string parses the sitemap fixture correctly +pub fn sitemap_from_string_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/sitemap/sitemap.xml") + let assert Ok(parsed) = sitemap.from_string(xml) + + // Verify the items were parsed correctly + parsed.items + |> should.equal([ + sitemap.SitemapItem( + loc: "https://gleam.run", + last_modified: None, + change_frequency: Some(sitemap.Monthly), + priority: Some(1.0), + ), + sitemap.SitemapItem( + loc: "https://gleam.run/blog", + last_modified: None, + change_frequency: Some(sitemap.Weekly), + priority: None, + ), + sitemap.SitemapItem( + loc: "https://gleam.run/blog/gleam-1.0", + last_modified: None, + change_frequency: None, + priority: None, + ), + sitemap.SitemapItem( + loc: "https://gleam.run/blog/gleam-1.1", + last_modified: None, + change_frequency: None, + priority: None, + ), + ]) +} + +/// Confirms roundtrip: to_string -> from_string produces equivalent config +pub fn sitemap_roundtrip_test() -> Nil { + let original = + sitemap.sitemap("https://example.com/sitemap.xml") + |> sitemap.with_sitemap_items([ + sitemap.item("https://example.com") + |> sitemap.with_item_frequency(sitemap.Daily) + |> sitemap.with_item_priority(1.0), + sitemap.item("https://example.com/about") + |> sitemap.with_item_frequency(sitemap.Monthly) + |> sitemap.with_item_priority(0.5), + ]) + + let serialized = sitemap.to_string(original) + let assert Ok(parsed) = sitemap.from_string(serialized) + + parsed.items + |> should.equal(original.items) +} + +/// Confirms parsing of sitemap with lastmod dates +pub fn sitemap_from_string_with_lastmod_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/sitemap/with_lastmod.xml") + let assert Ok(parsed) = sitemap.from_string(xml) + + let assert [item1, item2] = parsed.items + + item1.loc |> should.equal("https://example.com") + item1.change_frequency |> should.equal(Some(sitemap.Daily)) + item1.priority |> should.equal(Some(1.0)) + item1.last_modified |> should.be_some + + item2.loc |> should.equal("https://example.com/about") + item2.last_modified |> should.be_some + item2.change_frequency |> should.equal(None) + item2.priority |> should.equal(None) +} + +/// Confirms parsing of minimal sitemap with just loc +pub fn sitemap_from_string_minimal_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/sitemap/minimal.xml") + let assert Ok(parsed) = sitemap.from_string(xml) + + let assert [item] = parsed.items + + item.loc |> should.equal("https://example.com") + item.last_modified |> should.equal(None) + item.change_frequency |> should.equal(None) + item.priority |> should.equal(None) +} + +/// Confirms parsing handles all change frequency values +pub fn sitemap_from_string_all_frequencies_test() -> Nil { + let assert Ok(xml) = + simplifile.read("test/fixtures/sitemap/all_frequencies.xml") + let assert Ok(parsed) = sitemap.from_string(xml) + + let assert [always, hourly, daily, weekly, monthly, yearly, never] = + parsed.items + + always.change_frequency |> should.equal(Some(sitemap.Always)) + hourly.change_frequency |> should.equal(Some(sitemap.Hourly)) + daily.change_frequency |> should.equal(Some(sitemap.Daily)) + weekly.change_frequency |> should.equal(Some(sitemap.Weekly)) + monthly.change_frequency |> should.equal(Some(sitemap.Monthly)) + yearly.change_frequency |> should.equal(Some(sitemap.Yearly)) + never.change_frequency |> should.equal(Some(sitemap.Never)) +} + +/// Confirms parsing handles single item (not wrapped in list) +pub fn sitemap_from_string_single_item_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/sitemap/single_item.xml") + let assert Ok(parsed) = sitemap.from_string(xml) + + let assert [item] = parsed.items + + item.loc |> should.equal("https://example.com/single") + item.change_frequency |> should.equal(Some(sitemap.Weekly)) + item.priority |> should.equal(Some(0.8)) +} + +/// Confirms parsing handles various priority values +pub fn sitemap_from_string_with_priorities_test() -> Nil { + let assert Ok(xml) = + simplifile.read("test/fixtures/sitemap/with_priorities.xml") + let assert Ok(parsed) = sitemap.from_string(xml) + + let assert [high, medium, low, zero] = parsed.items + + high.priority |> should.equal(Some(1.0)) + medium.priority |> should.equal(Some(0.5)) + low.priority |> should.equal(Some(0.1)) + zero.priority |> should.equal(Some(0.0)) +} + +/// Confirms builder with_sitemap_item adds single item +pub fn sitemap_with_single_item_test() -> Nil { + let sitemap = + sitemap.sitemap("https://example.com/sitemap.xml") + |> sitemap.with_sitemap_item(sitemap.item("https://example.com/page1")) + |> sitemap.with_sitemap_item(sitemap.item("https://example.com/page2")) + + sitemap.items + |> should.equal([ + sitemap.SitemapItem( + loc: "https://example.com/page2", + last_modified: None, + change_frequency: None, + priority: None, + ), + sitemap.SitemapItem( + loc: "https://example.com/page1", + last_modified: None, + change_frequency: None, + priority: None, + ), + ]) +} + +/// Confirms with_sitemap_last_modified sets the sitemap's last modified time +pub fn sitemap_with_last_modified_test() -> Nil { + let ts = + timestamp.from_calendar( + calendar.Date(2024, calendar.June, 15), + calendar.TimeOfDay(10, 30, 0, 0), + calendar.utc_offset, + ) + + let sitemap = + sitemap.sitemap("https://example.com/sitemap.xml") + |> sitemap.with_sitemap_last_modified(ts) + + sitemap.last_modified |> should.equal(Some(ts)) +} + +/// Confirms item builder with_item_last_modified works +pub fn sitemap_item_with_last_modified_test() -> Nil { + let ts = + timestamp.from_calendar( + calendar.Date(2024, calendar.June, 15), + calendar.TimeOfDay(10, 30, 0, 0), + calendar.utc_offset, + ) + + let item = + sitemap.item("https://example.com") + |> sitemap.with_item_last_modified(ts) + + item.last_modified |> should.equal(Some(ts)) +} + +/// Confirms priority clamping in to_string (values > 1.0 clamped to 1.0) +pub fn sitemap_priority_clamp_high_test() -> Nil { + let sm = + sitemap.sitemap("https://example.com/sitemap.xml") + |> sitemap.with_sitemap_items([ + sitemap.item("https://example.com") + |> sitemap.with_item_priority(2.0), + ]) + + // Should contain priority 1.0, not 2.0 + sm + |> sitemap.to_string() + |> should.equal( + " + + +https://example.com +1.0 + +", + ) +} + +/// Confirms priority clamping in to_string (values < 0.0 clamped to 0.0) +pub fn sitemap_priority_clamp_low_test() -> Nil { + let sm = + sitemap.sitemap("https://example.com/sitemap.xml") + |> sitemap.with_sitemap_items([ + sitemap.item("https://example.com") + |> sitemap.with_item_priority(-0.5), + ]) + + // Should contain priority 0.0, not -0.5 + sm + |> sitemap.to_string() + |> should.equal( + " + + +https://example.com +0.0 + +", + ) +} + +/// Confirms empty sitemap produces valid XML +pub fn sitemap_empty_test() -> Nil { + let sm = sitemap.sitemap("https://example.com/sitemap.xml") + + sm + |> sitemap.to_string() + |> should.equal( + " + + +", + ) +} + +/// Confirms parsing of sitemap index files +pub fn sitemap_index_from_string_test() -> Nil { + let assert Ok(xml) = + simplifile.read("test/fixtures/sitemap/sitemap_index.xml") + let assert Ok(parsed) = sitemap.index_from_string(xml) + + parsed.sitemaps + |> should.equal([ + sitemap.SitemapReference( + loc: "https://example.com/sitemap-0.xml", + last_modified: None, + ), + sitemap.SitemapReference( + loc: "https://example.com/sitemap-1.xml", + last_modified: None, + ), + ]) +} + +/// Confirms parse function correctly detects regular sitemap +pub fn sitemap_parse_regular_test() -> Nil { + let assert Ok(xml) = simplifile.read("test/fixtures/sitemap/minimal.xml") + let assert Ok(result) = sitemap.parse(xml) + + case result { + sitemap.ParsedSitemap(sm) -> { + let assert [item] = sm.items + item.loc |> should.equal("https://example.com") + } + sitemap.ParsedSitemapIndex(_) -> { + panic as "Expected ParsedSitemap, got ParsedSitemapIndex" + } + } +} + +/// Confirms parse function correctly detects sitemap index +pub fn sitemap_parse_index_test() -> Nil { + let assert Ok(xml) = + simplifile.read("test/fixtures/sitemap/sitemap_index.xml") + let assert Ok(result) = sitemap.parse(xml) + + case result { + sitemap.ParsedSitemap(_) -> { + panic as "Expected ParsedSitemapIndex, got ParsedSitemap" + } + sitemap.ParsedSitemapIndex(index) -> { + index.sitemaps + |> should.equal([ + sitemap.SitemapReference( + loc: "https://example.com/sitemap-0.xml", + last_modified: None, + ), + sitemap.SitemapReference( + loc: "https://example.com/sitemap-1.xml", + last_modified: None, + ), + ]) + } + } +}