diff options
author | Alban Gruin | 2020-09-13 16:34:44 +0200 |
---|---|---|
committer | Alban Gruin | 2020-09-13 16:42:20 +0200 |
commit | 742db32948e6c60770d6616496917077a1386dbc (patch) | |
tree | deaf0498a89a0d02af20c3c314b28e0810cdc0a8 /src/course.ml | |
parent | d3ae708373eba13e961a70d752b36ec41cec2510 (diff) |
course: convert html entities to unicode characters
Signed-off-by: Alban Gruin <alban at pa1ch dot fr>
Diffstat (limited to 'src/course.ml')
-rw-r--r-- | src/course.ml | 19 |
1 file changed, 17 insertions, 2 deletions
```diff
diff --git a/src/course.ml b/src/course.ml
index 6534691..3633ae6 100644
--- a/src/course.ml
+++ b/src/course.ml
@@ -19,6 +19,21 @@ open CalendarLib
 
 module J = Json_encoding
 
+let get_unicode v =
+  let b = Buffer.create 1 in
+  Buffer.add_utf_8_uchar b (Uchar.of_int v);
+  Buffer.contents b
+
+let html_entities_regex = Re.Perl.compile_pat "&#(\\d+);"
+
+let replace_entities str =
+  Re.Pcre.full_split ~rex:html_entities_regex str
+  |> List.filter_map (function
+         | Re.Pcre.Group (_, v) ->
+            Some ("&#" ^ v ^ ";", get_unicode @@ int_of_string v)
+         | _ -> None)
+  |> Stringext.replace_all_assoc str
+
 let check_groups str =
   let group_affixes = ["MAT-Agreg Interne "; "3EME ANNEE "; "2EME ANNEE ";
                        "1ERE ANNEE "; "MAG1 "; "DEUST "; "M2 "; "M1 "; "L3P ";
@@ -36,9 +51,9 @@ let location_and_summary str category =
           if not has_groups then
             check_groups str, "", ""
           else if location = "" then
-            true, str, ""
+            true, replace_entities str, ""
           else if summary = "" then
-            true, str, location
+            true, replace_entities str, location
           else
             true, location, summary) parts (false, "", "") in
   if summary = "" then
```