diff options
author | Alban Gruin | 2020-09-13 16:34:44 +0200 |
---|---|---|
committer | Alban Gruin | 2020-09-13 16:42:20 +0200 |
commit | 742db32948e6c60770d6616496917077a1386dbc (patch) | |
tree | deaf0498a89a0d02af20c3c314b28e0810cdc0a8 | |
parent | d3ae708373eba13e961a70d752b36ec41cec2510 (diff) |
course: convert html entities to unicode characters
Signed-off-by: Alban Gruin <alban at pa1ch dot fr>
-rw-r--r-- | src/course.ml | 19 | ||||
-rw-r--r-- | src/dune | 6 |
2 files changed, 21 insertions, 4 deletions
```diff
diff --git a/src/course.ml b/src/course.ml
index 6534691..3633ae6 100644
--- a/src/course.ml
+++ b/src/course.ml
@@ -19,6 +19,21 @@ open CalendarLib
 
 module J = Json_encoding
 
+let get_unicode v =
+  let b = Buffer.create 1 in
+  Buffer.add_utf_8_uchar b (Uchar.of_int v);
+  Buffer.contents b
+
+let html_entities_regex = Re.Perl.compile_pat "&#(\\d+);"
+
+let replace_entities str =
+  Re.Pcre.full_split ~rex:html_entities_regex str
+  |> List.filter_map (function
+         | Re.Pcre.Group (_, v) ->
+            Some ("&#" ^ v ^ ";", get_unicode @@ int_of_string v)
+         | _ -> None)
+  |> Stringext.replace_all_assoc str
+
 let check_groups str =
   let group_affixes = ["MAT-Agreg Interne "; "3EME ANNEE "; "2EME ANNEE "; "1ERE ANNEE ";
                        "MAG1 "; "DEUST "; "M2 "; "M1 "; "L3P ";
@@ -36,9 +51,9 @@ let location_and_summary str category =
           if not has_groups then
             check_groups str, "", ""
           else if location = "" then
-            true, str, ""
+            true, replace_entities str, ""
           else if summary = "" then
-            true, str, location
+            true, replace_entities str, location
           else
             true, location, summary) parts (false, "", "") in
   if summary = "" then
diff --git a/src/dune b/src/dune
--- a/src/dune
+++ b/src/dune
@@ -2,7 +2,9 @@
  (name ucs)
  (libraries astring
             calendar
-            lwt.unix
             cohttp-lwt-unix
             ezjsonm
-            ocplib-json-typed))
+            lwt.unix
+            ocplib-json-typed
+            re
+            stringext))
```