aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alban Gruin 2020-09-13 16:34:44 +0200
committer Alban Gruin 2020-09-13 16:42:20 +0200
commit 742db32948e6c60770d6616496917077a1386dbc (patch)
tree deaf0498a89a0d02af20c3c314b28e0810cdc0a8
parent d3ae708373eba13e961a70d752b36ec41cec2510 (diff)
course: convert html entities to unicode characters
Signed-off-by: Alban Gruin <alban at pa1ch dot fr>
-rw-r--r-- src/course.ml 19
-rw-r--r-- src/dune 6
2 files changed, 21 insertions, 4 deletions
diff --git a/src/course.ml b/src/course.ml
index 6534691..3633ae6 100644
--- a/src/course.ml
+++ b/src/course.ml
@@ -19,6 +19,21 @@ open CalendarLib
module J = Json_encoding
+(** [get_unicode v] returns the UTF-8 encoding of the code point [v].
+
+    Integers that are not Unicode scalar values (negative numbers,
+    surrogates, anything above U+10FFFF) would make [Uchar.of_int] raise
+    [Invalid_argument]; they are encoded as U+FFFD REPLACEMENT CHARACTER
+    instead, so this function never raises. *)
+let get_unicode v =
+  let uchar = if Uchar.is_valid v then Uchar.of_int v else Uchar.rep in
+  (* A UTF-8 sequence is at most four bytes long. *)
+  let b = Buffer.create 4 in
+  Buffer.add_utf_8_uchar b uchar;
+  Buffer.contents b
+
+(* Compiled pattern for decimal numeric character references such as
+   "&#233;"; capture group 1 holds the digits. *)
+let html_entities_regex = Re.compile (Re.Perl.re "&#(\\d+);")
+
+(** [replace_entities str] returns [str] with every decimal numeric
+    character reference matched by [html_entities_regex] (for instance
+    "&#233;") replaced by the UTF-8 encoding of the code point it names.
+    Text the pattern does not match is copied through unchanged.
+
+    A reference whose number overflows [int] or is not a Unicode scalar
+    value is replaced by U+FFFD REPLACEMENT CHARACTER, rather than letting
+    [int_of_string] or [Uchar.of_int] raise on malformed input. *)
+let replace_entities str =
+  let decode groups =
+    (* Group 1 is the run of digits between "&#" and ";". *)
+    let code =
+      match int_of_string_opt (Re.Group.get groups 1) with
+      | Some code when Uchar.is_valid code -> code
+      | Some _ | None -> 0xFFFD
+    in
+    get_unicode code
+  in
+  (* Single pass over [str]: each match is rewritten where it is found,
+     instead of collecting the matches and rescanning the string. *)
+  Re.replace ~all:true html_entities_regex ~f:decode str
+
let check_groups str =
let group_affixes = ["MAT-Agreg Interne "; "3EME ANNEE "; "2EME ANNEE ";
"1ERE ANNEE "; "MAG1 "; "DEUST "; "M2 "; "M1 "; "L3P ";
@@ -36,9 +51,9 @@ let location_and_summary str category =
if not has_groups then
check_groups str, "", ""
else if location = "" then
- true, str, ""
+ true, replace_entities str, ""
else if summary = "" then
- true, str, location
+ true, replace_entities str, location
else
true, location, summary) parts (false, "", "") in
if summary = "" then
diff --git a/src/dune b/src/dune
index 1a1135a..c29403a 100644
--- a/src/dune
+++ b/src/dune
@@ -2,7 +2,9 @@
(name ucs)
(libraries astring
calendar
- lwt.unix
cohttp-lwt-unix
ezjsonm
- ocplib-json-typed))
+ lwt.unix
+ ocplib-json-typed
+ re
+ stringext))