diff options
| author | Alban Gruin | 2020-09-13 16:34:44 +0200 | 
|---|---|---|
| committer | Alban Gruin | 2020-09-13 16:42:20 +0200 | 
| commit | 742db32948e6c60770d6616496917077a1386dbc (patch) | |
| tree | deaf0498a89a0d02af20c3c314b28e0810cdc0a8 /src | |
| parent | d3ae708373eba13e961a70d752b36ec41cec2510 (diff) | |
course: convert html entities to unicode characters
Signed-off-by: Alban Gruin <alban at pa1ch dot fr>
Diffstat (limited to 'src')
| -rw-r--r-- | src/course.ml | 19 | ||||
| -rw-r--r-- | src/dune | 6 | 
2 files changed, 21 insertions, 4 deletions
```diff
diff --git a/src/course.ml b/src/course.ml
index 6534691..3633ae6 100644
--- a/src/course.ml
+++ b/src/course.ml
@@ -19,6 +19,21 @@ open CalendarLib
 
 module J = Json_encoding
 
+let get_unicode v =
+  let b = Buffer.create 1 in
+  Buffer.add_utf_8_uchar b (Uchar.of_int v);
+  Buffer.contents b
+
+let html_entities_regex = Re.Perl.compile_pat "&#(\\d+);"
+
+let replace_entities str =
+  Re.Pcre.full_split ~rex:html_entities_regex str
+  |> List.filter_map (function
+         | Re.Pcre.Group (_, v) ->
+            Some ("&#" ^ v ^ ";", get_unicode @@ int_of_string v)
+         | _ -> None)
+  |> Stringext.replace_all_assoc str
+
 let check_groups str =
   let group_affixes = ["MAT-Agreg Interne "; "3EME ANNEE "; "2EME ANNEE ";
                        "1ERE ANNEE "; "MAG1 "; "DEUST "; "M2 "; "M1 "; "L3P ";
@@ -36,9 +51,9 @@ let location_and_summary str category =
         if not has_groups then
           check_groups str, "", ""
         else if location = "" then
-          true, str, ""
+          true, replace_entities str, ""
         else if summary = "" then
-          true, str, location
+          true, replace_entities str, location
         else
           true, location, summary) parts (false, "", "") in
   if summary = "" then
diff --git a/src/dune b/src/dune
--- a/src/dune
+++ b/src/dune
@@ -2,7 +2,9 @@
  (name ucs)
  (libraries astring
             calendar
-            lwt.unix
             cohttp-lwt-unix
             ezjsonm
-            ocplib-json-typed))
+            lwt.unix
+            ocplib-json-typed
+            re
+            stringext))
```
