From fc89f6d4a5a98d2ed4c279610b098905d6f1ce84 Mon Sep 17 00:00:00 2001 From: Marcello Seri Date: Mon, 2 Mar 2026 14:36:17 +0100 Subject: [PATCH 1/4] First attempt at fix for issue #37 Signed-off-by: Marcello Seri --- doi2bib.opam.template | 2 +- doi2bib/bin/doi2bib.ml | 4 +-- doi2bib/lib/doi2bib.ml | 1 + doi2bib/lib/dune | 2 +- doi2bib/lib/helpers.ml | 54 +++++++++++++++++++++++++++++++++++++++ doi2bib/tests/doi.t/run.t | 15 +++++++++++ 6 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 doi2bib/lib/helpers.ml diff --git a/doi2bib.opam.template b/doi2bib.opam.template index 9304369..d524c24 100644 --- a/doi2bib.opam.template +++ b/doi2bib.opam.template @@ -1,5 +1,5 @@ depends: [ - "dune" {>= "2.8"} + "dune" {>= "3.0"} "ocaml" {>= "4.08"} "bibfmt" {= version} "astring" {>= "0.8.0"} diff --git a/doi2bib/bin/doi2bib.ml b/doi2bib/bin/doi2bib.ml index eaa7764..ee9ffa0 100644 --- a/doi2bib/bin/doi2bib.ml +++ b/doi2bib/bin/doi2bib.ml @@ -6,7 +6,7 @@ let process_id outfile id = let open Lwt.Syntax in let* bibtex = Http.get_bib_entry @@ Parser.parse_id id in - let parsed_items = Bibtex.parse_bibtex bibtex in + let parsed_items = Bibtex.parse_bibtex bibtex |> List.map Helpers.clean_item in let formatted = if List.length parsed_items = 0 then ( Printf.eprintf @@ -88,7 +88,7 @@ let process_file outfile infile = let write_out () = let bibtex_out = Buffer.contents bibtex_buffer in let open Bibtex in - let parsed_items = parse_bibtex bibtex_out in + let parsed_items = parse_bibtex bibtex_out |> List.map Helpers.clean_item in let options = { default_options with strict = true } in let formatted = pretty_print_bibtex ~options parsed_items in diff --git a/doi2bib/lib/doi2bib.ml b/doi2bib/lib/doi2bib.ml index 741e367..f98e793 100644 --- a/doi2bib/lib/doi2bib.ml +++ b/doi2bib/lib/doi2bib.ml @@ -1,2 +1,3 @@ module Http = Http module Parser = Parser +module Helpers = Helpers diff --git a/doi2bib/lib/dune b/doi2bib/lib/dune index 1e5ae69..0caf466 100644 --- 
a/doi2bib/lib/dune +++ b/doi2bib/lib/dune @@ -1,5 +1,5 @@ (library (name doi2bib) - (libraries astring cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix) + (libraries astring cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix bibfmt) (preprocess future_syntax) (package doi2bib)) diff --git a/doi2bib/lib/helpers.ml b/doi2bib/lib/helpers.ml new file mode 100644 index 0000000..8375bef --- /dev/null +++ b/doi2bib/lib/helpers.ml @@ -0,0 +1,54 @@ +let decode_html_entities s = + let replacements = + [ + ("&amp;", "&"); + ("&quot;", "\""); + ("&apos;", "'"); + ("&copy;", "(c)"); + ("&reg;", "(r)"); + ("&trade;", "(tm)"); + ("&nbsp;", " "); + ] + in + List.fold_left + (fun acc (pattern, replacement) -> + let re = Re.compile (Re.str pattern) in + Re.replace_string ~all:true re ~by:replacement acc) + s replacements + +let escape_ampersand s = + (* We want to match '&' that is not preceded by '\', but the re library + does not support lookbehind, so we can match either the start of the string or a non-backslash character before '&', keep it and replace + the remaining '&' by '\&' *) + let re = Re.compile (Re.seq [ Re.group (Re.alt [ Re.bos; Re.compl [ Re.char '\\' ] ]); Re.char '&' ]) in + Re.replace re + ~f:(fun subs -> + let prefix = Re.Group.get subs 1 in + prefix ^ "\\&") + s + +(* The doi API can return html entities more or less everywhere in the fields + content, so we need to replace them. So far we replace some common ones + and make sure to escape the '&'. It can still fail if the entry includes # + or % (this is already treated in URLs), but I'd wait for it to happen + before taking any further action. 
*) +let clean_string s = s |> decode_html_entities |> escape_ampersand + +let clean_field_value = + let open Bibtex in + function + | QuotedStringValue s -> QuotedStringValue (clean_string s) + | BracedStringValue s -> BracedStringValue (clean_string s) + | UnquotedStringValue s -> UnquotedStringValue (clean_string s) + | NumberValue n -> NumberValue n + +let clean_item = + let open Bibtex in + function + | Entry e -> + let clean_content = function + | Field f -> Field { f with value = clean_field_value f.value } + | EntryComment c -> EntryComment c + in + Entry { e with contents = List.map clean_content e.contents } + | Comment c -> Comment c diff --git a/doi2bib/tests/doi.t/run.t b/doi2bib/tests/doi.t/run.t index 0a32ebd..327e16a 100644 --- a/doi2bib/tests/doi.t/run.t +++ b/doi2bib/tests/doi.t/run.t @@ -42,3 +42,18 @@ DOI entry containing parentheses YEAR = {1989}, PAGES = {627-649} } +DOI entry containing HTML entities and ampersands + $ doi2bib 10.2140/gt.2014.18.669 + @article{Karshon_2014, + TITLE = {Classification of Hamiltonian torus actions with two-dimensional quotients}, + VOLUME = {18}, + ISSN = {1465-3060}, + URL = {http://dx.doi.org/10.2140/gt.2014.18.669}, + DOI = {10.2140/gt.2014.18.669}, + NUMBER = {2}, + JOURNAL = {Geometry \& Topology}, + PUBLISHER = {Mathematical Sciences Publishers}, + AUTHOR = {Karshon, Yael and Tolman, Susan}, + YEAR = {2014}, + PAGES = {669-716} + } From 8f79c1256da450556c890fc7e164104d06c950f2 Mon Sep 17 00:00:00 2001 From: Marcello Seri Date: Mon, 2 Mar 2026 14:45:25 +0100 Subject: [PATCH 2/4] Stop using Astring since OCaml's stdlib now is enough Signed-off-by: Marcello Seri --- doi2bib.opam | 3 +-- doi2bib.opam.template | 3 +-- doi2bib/lib/dune | 2 +- doi2bib/lib/parser.ml | 20 ++++++++++---------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/doi2bib.opam b/doi2bib.opam index b3dc4cb..f10efc4 100644 --- a/doi2bib.opam +++ b/doi2bib.opam @@ -23,9 +23,8 @@ build: [ dev-repo: 
"git+https://github.com/mseri/doi2bib.git" depends: [ "dune" {>= "3.0"} - "ocaml" {>= "4.08"} + "ocaml" {>= "4.14.0"} "bibfmt" {= version} - "astring" {>= "0.8.0"} "cohttp-lwt-unix" {>= "2.5.0"} "cmdliner" {>= "1.1.0"} "clz" {>= "0.1.0"} diff --git a/doi2bib.opam.template b/doi2bib.opam.template index d524c24..f642ea5 100644 --- a/doi2bib.opam.template +++ b/doi2bib.opam.template @@ -1,8 +1,7 @@ depends: [ "dune" {>= "3.0"} - "ocaml" {>= "4.08"} + "ocaml" {>= "4.14.0"} "bibfmt" {= version} - "astring" {>= "0.8.0"} "cohttp-lwt-unix" {>= "2.5.0"} "cmdliner" {>= "1.1.0"} "clz" {>= "0.1.0"} diff --git a/doi2bib/lib/dune b/doi2bib/lib/dune index 0caf466..b1b5b8a 100644 --- a/doi2bib/lib/dune +++ b/doi2bib/lib/dune @@ -1,5 +1,5 @@ (library (name doi2bib) - (libraries astring cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix bibfmt) + (libraries cohttp-lwt-unix clz.cohttp ezxmlm lwt re unix bibfmt) (preprocess future_syntax) (package doi2bib)) diff --git a/doi2bib/lib/parser.ml b/doi2bib/lib/parser.ml index 720f9e0..8129dd9 100644 --- a/doi2bib/lib/parser.ml +++ b/doi2bib/lib/parser.ml @@ -8,18 +8,17 @@ let string_of_id = function | PubMed s -> "PubMed ID '" ^ s ^ "'" let parse_id id = - let open Astring in - let is_prefix affix s = String.is_prefix ~affix (String.Ascii.lowercase s) in - let sub start s = - String.sub ~start s |> String.Sub.to_string |> String.trim + let is_prefix affix s = + let n = String.length affix in + String.length s >= n && String.sub (String.lowercase_ascii s) 0 n = affix in - let contains c s = String.exists (fun c' -> c' = c) s in + let sub start s = String.trim (String.sub s start (String.length s - start)) in match id with | doi when is_prefix "doi:" doi -> DOI (sub 4 doi) | arxiv when is_prefix "arxiv:" arxiv -> ArXiv (sub 6 arxiv) | pubmed when is_prefix "pmc" pubmed -> PubMed pubmed - | doi when contains '/' doi -> DOI (String.trim doi) - | arxiv when contains '.' 
arxiv -> ArXiv (String.trim arxiv) + | doi when String.contains doi '/' -> DOI (String.trim doi) + | arxiv when String.contains arxiv '.' -> ArXiv (String.trim arxiv) | _ -> raise (Parse_error id) let parse_atom id atom = @@ -42,13 +41,14 @@ let parse_atom id atom = get_attr "term" a in let bibid = - let open Astring in - (match String.cuts ~empty:false ~sep:" " authors with + (match String.split_on_char ' ' authors |> List.filter (fun s -> s <> "") with | _ :: s :: _ -> s | s :: _ -> s | [] -> "") ^ year - ^ (String.cut ~sep:" " title |> Option.map fst |> Option.value ~default:"") + ^ (match String.index_opt title ' ' with + | Some i -> String.sub title 0 i + | None -> "") in Printf.sprintf {|@misc{%s, From 3dfb07a63fce207156f308b1e345e93fe6989dd5 Mon Sep 17 00:00:00 2001 From: Marcello Seri Date: Mon, 2 Mar 2026 14:46:53 +0100 Subject: [PATCH 3/4] Remove astring from nix Signed-off-by: Marcello Seri --- nix/default.nix | 1 - 1 file changed, 1 deletion(-) diff --git a/nix/default.nix b/nix/default.nix index 308ee67..1d21588 100644 --- a/nix/default.nix +++ b/nix/default.nix @@ -23,7 +23,6 @@ buildDunePackage ({ propagatedBuildInputs = [ bibfmt - astring cohttp-lwt-unix cmdliner clz From b583c5bdd498ac36fc75e15ec08420d5ed7d11a6 Mon Sep 17 00:00:00 2001 From: Marcello Seri Date: Mon, 2 Mar 2026 14:48:17 +0100 Subject: [PATCH 4/4] run dune build @fmt Signed-off-by: Marcello Seri --- bibfmt/bin/bibfmt.ml | 10 +++++++--- bibfmt/lib/bibtex.ml | 9 +++++---- bibfmt/lib/bibtex.mli | 13 ++++++++----- doi2bib/bin/doi2bib.ml | 4 +++- doi2bib/lib/helpers.ml | 8 +++++++- doi2bib/lib/parser.ml | 15 ++++++++++----- 6 files changed, 40 insertions(+), 19 deletions(-) diff --git a/bibfmt/bin/bibfmt.ml b/bibfmt/bin/bibfmt.ml index 29d1943..4219dc5 100644 --- a/bibfmt/bin/bibfmt.ml +++ b/bibfmt/bin/bibfmt.ml @@ -55,7 +55,7 @@ let bibfmt out strict single_line quiet verbose force files = "Warning: No valid BibTeX entries found in the file.\n%!"; combined_content) else - let options = + 
let options = { Bibtex.default_options with strict; single_line } in Bibtex.pretty_print_bibtex ~options parse_result.items) @@ -114,7 +114,8 @@ let () = in let single_line = let doc = - "Force field values onto a single line by replacing newlines with a space." + "Force field values onto a single line by replacing newlines with a \ + space." in Arg.(value & flag & info [ "l"; "single-line" ] ~doc) in @@ -141,7 +142,10 @@ let () = Arg.(value & pos_all string [] & info [] ~docv:"FILES" ~doc) in let bibfmt_t = - Term.(ret (const bibfmt $ out $ strict $ single_line $ quiet $ verbose $ force $ files)) + Term.( + ret + (const bibfmt $ out $ strict $ single_line $ quiet $ verbose $ force + $ files)) in let info = let doc = "A little CLI tool to pretty print bibtex files." in diff --git a/bibfmt/lib/bibtex.ml b/bibfmt/lib/bibtex.ml index 2f68dbb..84d5ded 100644 --- a/bibfmt/lib/bibtex.ml +++ b/bibfmt/lib/bibtex.ml @@ -677,9 +677,8 @@ let format_entry options entry = |> List.filter (function | Field f -> ( match f.value with - | QuotedStringValue s - | BracedStringValue s - | UnquotedStringValue s -> + | QuotedStringValue s | BracedStringValue s | UnquotedStringValue s + -> String.length (String.trim s) > 0 | NumberValue _ -> true) | EntryComment _ -> true) @@ -706,7 +705,9 @@ let format_entry options entry = let contents_str = if filtered_contents = [] then "" else - let formatted_contents = List.map format_entry_content' filtered_contents in + let formatted_contents = + List.map format_entry_content' filtered_contents + in let rec add_commas_except_last = function | [] -> [] | [ last ] -> [ last ] (* No comma for the last item *) diff --git a/bibfmt/lib/bibtex.mli b/bibfmt/lib/bibtex.mli index 079ec40..c4269b4 100644 --- a/bibfmt/lib/bibtex.mli +++ b/bibfmt/lib/bibtex.mli @@ -150,12 +150,14 @@ val format_field_value_with_url_unescaping : @return String representation with URLs unescaped if applicable *) val format_field : bool -> bool -> field -> string -(** 
[format_field capitalized single_line field] formats a complete field (name = value). +(** [format_field capitalized single_line field] formats a complete field (name + = value). @param field The field to format @return String representation of the field *) val format_entry_content : bool -> bool -> entry_content -> string -(** [format_entry_content capitalized single_line content] formats entry content (field or comment). +(** [format_entry_content capitalized single_line content] formats entry content + (field or comment). @param content The entry content to format @return String representation of the content *) @@ -201,13 +203,14 @@ val find_duplicate_groups : Each group contains at least 2 entries that match on the specified keys. *) val string_of_field_value : field_value -> string -(** [string_of_field_value fv] converts a field value to its string representation. +(** [string_of_field_value fv] converts a field value to its string + representation. @param fv The field value to convert @return String representation of the field value *) val make_field : string -> string -> entry_content -(** [make_field name value] creates a BibTeX field with the given name and value. - The value is wrapped in braces. +(** [make_field name value] creates a BibTeX field with the given name and + value. The value is wrapped in braces. 
@param name Field name @param value Field value as a string @return An entry_content Field with a BracedStringValue *) diff --git a/doi2bib/bin/doi2bib.ml b/doi2bib/bin/doi2bib.ml index ee9ffa0..cf4d512 100644 --- a/doi2bib/bin/doi2bib.ml +++ b/doi2bib/bin/doi2bib.ml @@ -6,7 +6,9 @@ let process_id outfile id = let open Lwt.Syntax in let* bibtex = Http.get_bib_entry @@ Parser.parse_id id in - let parsed_items = Bibtex.parse_bibtex bibtex |> List.map Helpers.clean_item in + let parsed_items = + Bibtex.parse_bibtex bibtex |> List.map Helpers.clean_item + in let formatted = if List.length parsed_items = 0 then ( Printf.eprintf diff --git a/doi2bib/lib/helpers.ml b/doi2bib/lib/helpers.ml index 8375bef..042746f 100644 --- a/doi2bib/lib/helpers.ml +++ b/doi2bib/lib/helpers.ml @@ -20,7 +20,13 @@ let escape_ampersand s = (* We want to match '&' that is not preceded by '\', but the re library does not support lookbehind, so we can match either the start of the string or a non-backslash character before '&', keep it and replace the remaining '&' by '\&' *) - let re = Re.compile (Re.seq [ Re.group (Re.alt [ Re.bos; Re.compl [ Re.char '\\' ] ]); Re.char '&' ]) in + let re = + Re.compile + (Re.seq + [ + Re.group (Re.alt [ Re.bos; Re.compl [ Re.char '\\' ] ]); Re.char '&'; + ]) + in Re.replace re ~f:(fun subs -> let prefix = Re.Group.get subs 1 in diff --git a/doi2bib/lib/parser.ml b/doi2bib/lib/parser.ml index 8129dd9..d82283d 100644 --- a/doi2bib/lib/parser.ml +++ b/doi2bib/lib/parser.ml @@ -12,7 +12,9 @@ let parse_id id = let n = String.length affix in String.length s >= n && String.sub (String.lowercase_ascii s) 0 n = affix in - let sub start s = String.trim (String.sub s start (String.length s - start)) in + let sub start s = + String.trim (String.sub s start (String.length s - start)) + in match id with | doi when is_prefix "doi:" doi -> DOI (sub 4 doi) | arxiv when is_prefix "arxiv:" arxiv -> ArXiv (sub 6 arxiv) @@ -41,14 +43,17 @@ let parse_atom id atom = get_attr "term" 
a in let bibid = - (match String.split_on_char ' ' authors |> List.filter (fun s -> s <> "") with + (match + String.split_on_char ' ' authors |> List.filter (fun s -> s <> "") + with | _ :: s :: _ -> s | s :: _ -> s | [] -> "") ^ year - ^ (match String.index_opt title ' ' with - | Some i -> String.sub title 0 i - | None -> "") + ^ + match String.index_opt title ' ' with + | Some i -> String.sub title 0 i + | None -> "" in Printf.sprintf {|@misc{%s,