aboutsummaryrefslogtreecommitdiff
path: root/management/parsers
diff options
context:
space:
mode:
author Alban Gruin 2018-09-06 21:46:51 +0200
committer Alban Gruin 2018-09-06 21:46:51 +0200
commit 676345434415d40363c80960484abf0295ca800a (patch)
tree 76c0f71fd86f19962812a63da109bf79ebd2d43c /management/parsers
parent 6b8ea6615de6000ea14396fc2d31eb5c6cb159f9 (diff)
parent b4fde18263de491650c71bd31dffe3c324e97879 (diff)
Merge branch 'stable/0.14.z' into prod/pa1ch/0.y.zv0.14.0-pa1chprod/pa1ch/0.y.z
Diffstat (limited to 'management/parsers')
-rw-r--r-- management/parsers/abstractparser.py 52
-rw-r--r-- management/parsers/ups2017.py 162
-rw-r--r-- management/parsers/ups2018.py 213
3 files changed, 427 insertions, 0 deletions
diff --git a/management/parsers/abstractparser.py b/management/parsers/abstractparser.py
new file mode 100644
index 0000000..8d55b6d
--- /dev/null
+++ b/management/parsers/abstractparser.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2018 Alban Gruin
+#
+# celcatsanitizer is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# celcatsanitizer is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with celcatsanitizer. If not, see <http://www.gnu.org/licenses/>.
+
+import abc
+import requests
+
+import edt
+
+
class AbstractParser(metaclass=abc.ABCMeta):
    """Common base for timetable parsers.

    A parser is bound to one ``source`` object; this class only reads its
    ``url`` attribute.  Subclasses must implement ``get_events``,
    ``get_update_date`` and ``get_weeks``.
    """

    def __init__(self, source):
        """Store *source* and build the default ``User-Agent`` value."""
        self.source = source
        self.user_agent = "celcatsanitizer/" + edt.VERSION

    def _build_headers(self, user_agent=None, headers=None):
        """Return a new header dict carrying the ``User-Agent`` to send.

        *headers* is copied, never modified, so a dict owned by the caller
        (or shared between several requests) is left untouched.  When
        *user_agent* is ``None`` the parser's default one is used.
        """
        merged = dict(headers) if headers else {}
        merged["User-Agent"] = (
            user_agent if user_agent is not None else self.user_agent
        )
        return merged

    def _make_request(self, url, user_agent=None, encoding="utf8", **kwargs):
        """Send a GET request to *url* and return the ``requests`` response.

        Recognised keyword arguments:

        * ``headers`` -- extra HTTP headers (copied before use);
        * ``params`` -- query-string parameters;
        * ``timeout`` -- forwarded to ``requests.get``; the default,
          ``None``, keeps the previous behaviour of waiting indefinitely.

        The response's ``encoding`` attribute is forced to *encoding*.
        The HTTP status is not checked here.
        """
        headers = self._build_headers(user_agent, kwargs.get("headers"))
        params = kwargs.get("params", {})

        req = requests.get(
            url,
            headers=headers,
            params=params,
            timeout=kwargs.get("timeout"),
        )
        req.encoding = encoding

        return req

    @abc.abstractmethod
    def get_events(self, today=None, year=None, week=None):
        """Return or yield the courses found in the source.

        The signature mirrors the one used by the concrete parsers of this
        package (``today``, then an optional ``year``/``week`` pair).
        """

    @abc.abstractmethod
    def get_update_date(self):
        """Return the date the source was last updated, or ``None``."""

    @abc.abstractmethod
    def get_weeks(self):
        """Return the weeks described by the source."""

    def get_source(self):
        """Download the source's URL and return the response."""
        return self._make_request(self.source.url)
diff --git a/management/parsers/ups2017.py b/management/parsers/ups2017.py
new file mode 100644
index 0000000..99ce34d
--- /dev/null
+++ b/management/parsers/ups2017.py
@@ -0,0 +1,162 @@
+# Copyright (C) 2017-2018 Alban Gruin
+#
+# celcatsanitizer is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# celcatsanitizer is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with celcatsanitizer. If not, see <http://www.gnu.org/licenses/>.
+
+import datetime
+import re
+
+from bs4 import BeautifulSoup
+from django.utils import timezone
+
+from ...models import Course, Group, Room
+from .abstractparser import AbstractParser
+
+
def add_time(date, time):
    """Return *date* moved forward by the clock time in *time*.

    *time* is an ``HH:MM`` string; its hours and minutes are added to
    *date* as a duration.  ``ValueError`` is raised by ``strptime`` when
    the string does not match that format.
    """
    clock = datetime.datetime.strptime(time, "%H:%M")
    return date + datetime.timedelta(hours=clock.hour, minutes=clock.minute)
+
+
class Parser(AbstractParser):
    """Parser for the 2017 Celcat export, read with BeautifulSoup.

    Call order matters: ``get_source()`` stores the parsed document in
    ``self.soup``, ``get_weeks()`` fills ``self.weeks`` from it, and
    ``get_events()`` / ``get_update_date()`` read those attributes.
    """

    def __get_event(self, event, event_week, today):
        """Create and return a ``Course`` from an event element read by BS4.

        ``event_week`` is the date associated with the event's week in
        ``self.weeks``; the event's ``day`` field is added to it as a
        number of days.  ``today`` may be ``None`` to disable the filter
        on courses that have already started.

        Returns ``None`` when the event is filtered out.  Attributes set
        after the row is created (name, type, notes) are not saved here.
        NOTE(review): presumably the caller saves the course -- confirm.
        """
        # The event date comes from its week plus a day offset; start and
        # end times are then added to that date.
        date = event_week + datetime.timedelta(int(event.day.text))
        begin = add_time(date, event.starttime.text)
        end = add_time(date, event.endtime.text)

        # Skip courses that began before the processing time.
        if today is not None and begin < today:
            return None

        course = Course.objects.create(
            source=self.source, begin=begin, end=end
        )

        # Groups attending the course.
        groups = [
            Group.objects.get_or_create(
                source=self.source, celcat_name=item.text
            )[0]
            for item in event.resources.group.find_all("item")
        ]
        course.groups.add(*groups)

        # "Notes" field.
        if event.notes is not None:
            course.notes = "\n".join(event.notes.find_all(text=True))

        # "Name" field.  A course may have no module; in that case its
        # category, when present, is used as its name.  With neither, the
        # name keeps whatever default the model defines.
        if event.resources.module is not None:
            course.name = event.resources.module.item.text
        elif event.category is not None:
            course.name = event.category.text

        # Course type.
        if event.category is not None:
            course.type = event.category.text

        # Rooms assigned to the course are created on the fly if needed
        # and attached to it.
        if event.resources.room is not None:
            rooms = [
                Room.objects.get_or_create(name=item.text)[0]
                for item in event.resources.room.find_all("item")
            ]
            course.rooms.add(*rooms)

        return course

    def get_events(self, today, year=None, week=None):
        """Yield every course available in the Celcat timetable.

        When both ``year`` and ``week`` are given, only events of that
        ISO year/week are processed (the ISO year is used, like the 2018
        parser does, so the week straddling New Year is matched
        correctly).  Events without a group, a start time or an end time
        are skipped, as are those rejected by the ``today`` filter.
        """
        restrict_to_week = year is not None and week is not None

        for event in self.soup.find_all("event"):
            event_week = self.weeks[event.rawweeks.text]

            if restrict_to_week:
                iso_year, iso_week, _ = event_week.isocalendar()
                if iso_year != year or iso_week != week:
                    continue

            if (
                event.resources.group is None
                or event.starttime is None
                or event.endtime is None
            ):
                continue

            course = self.__get_event(event, event_week, today)
            if course is not None:
                yield course

    def get_update_date(self):
        """Return the update date found in the document footer.

        The footer text is searched for ``D/M/Y H:M:S`` (six groups of
        digits separated by slashes, whitespace and colons).  Returns an
        aware datetime, or ``None`` when there is no footer or no match.
        """
        footer = self.soup.footer
        if footer is None:
            return None

        datetime_regex = re.compile(r"(\d+)/(\d+)/(\d+)\s+(\d+):(\d+):(\d+)")
        search = datetime_regex.search(footer.text)
        if search is None:
            return None

        day, month, year, hour, minute, second = [
            int(v) for v in search.groups()
        ]
        date = datetime.datetime(year, month, day, hour, minute, second)
        return timezone.make_aware(date)

    def get_weeks(self):
        """Build, store and return the mapping of week IDs to dates.

        Every week of the timetable is stored in a ``span`` element.  It
        holds a string acting as an ID (``alleventweeks``) and a ``date``
        attribute, which is the Monday of that week.  An event refers to
        a week by that ID and gives a number of days after its start.
        """
        self.weeks = {}

        for span in self.soup.find_all("span"):
            # Parse the date and map it to the week ID.
            self.weeks[span.alleventweeks.text] = timezone.make_aware(
                datetime.datetime.strptime(span["date"], "%d/%m/%Y")
            )

        return self.weeks

    def get_source(self):
        """Download the source, parse it and return the resulting soup."""
        req = super().get_source()
        self.soup = BeautifulSoup(req.content, "html.parser")
        return self.soup
diff --git a/management/parsers/ups2018.py b/management/parsers/ups2018.py
new file mode 100644
index 0000000..8d97517
--- /dev/null
+++ b/management/parsers/ups2018.py
@@ -0,0 +1,213 @@
+# Copyright (C) 2018 Alban Gruin
+#
+# celcatsanitizer is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# celcatsanitizer is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with celcatsanitizer. If not, see <http://www.gnu.org/licenses/>.
+
+from datetime import datetime, timedelta
+
+import asyncio
+import calendar
+import json
+
+from django.utils import timezone
+
+import lxml.html
+import requests
+
+from ...models import Course, Group, Room
+from ...utils import get_current_week, get_week
+from .abstractparser import AbstractParser
+
VARNAME = "v.events.list = "


def find_events_list(soup):
    """Return the event list embedded in the page's inline scripts.

    *soup* is a parsed document exposing ``xpath``.  Each script text is
    scanned line by line for a line starting with ``VARNAME``; the rest
    of that line, minus trailing whitespace and the statement's ``;``, is
    decoded as JSON.  If several lines match, the last one wins.  Returns
    an empty list when nothing matches.

    The terminator is stripped explicitly rather than by dropping a fixed
    number of characters, so the result does not depend on whether the
    page uses CRLF or LF line endings.
    """
    res = []
    for script in soup.xpath("//script/text()"):
        if VARNAME not in script:
            continue

        for line in script.split("\n"):
            if line.startswith(VARNAME):
                payload = line[len(VARNAME):].rstrip().rstrip(";")
                res = json.loads(payload)

    return res
+
+
def get_next_month(dt):
    """Return *dt* moved to the first day of the following month.

    Only the year, month and day change; any time-of-day fields are kept
    as they are.  December rolls over to January of the next year.
    """
    # divmod turns month 12 into (carry=1, index=0) and any other month m
    # into (carry=0, index=m); index + 1 is then the following month.
    carry, month_index = divmod(dt.month, 12)
    return dt.replace(year=dt.year + carry, month=month_index + 1, day=1)
+
+
class Parser(AbstractParser):
    """Parser for the 2018 timetable pages.

    Events are embedded in each page as a JSON list (see
    ``find_events_list``).  The page fetched at construction time gives
    the first month; the other months listed in its ``<option>``
    elements are fetched by ``get_source()``.
    """

    # An event's text is a "<br>"-separated list of fields; the field
    # that names the groups is recognised by one of these prefixes.
    _GROUP_PREFIXES = (
        "L1 ", "L2 ", "L3 ", "L3P ", "M1 ", "M2 ", "DEUST ", "MAG1 ",
        "1ERE ANNEE ", "2EME ANNEE ", "3EME ANNEE ",
        "MAT-Agreg Interne ",
    )

    def __init__(self, source):
        """Download and parse the base page, then list the months.

        ``self.months`` holds the text of the selected ``<option>`` and
        of every option after it.
        """
        super().__init__(source)

        # Header taken from the author's Firefox.  NOTE(review):
        # presumably this makes the month labels come back in English,
        # which the "%B, %Y" parsing below relies on -- confirm.
        base_req = self._make_request(
            source.url, headers={"Accept-Language": "en-US,en;q=0.5"}
        )

        parser = lxml.html.HTMLParser(encoding="utf-8")
        self.soup = lxml.html.document_fromstring(
            base_req.content, parser=parser
        )

        self.months = []
        for option in self.soup.xpath("//option"):
            if option.get("selected") is not None or self.months:
                self.months.append(option.text)

    def __get_event(self, event, today,
                    beginning_of_month, end_of_month,
                    year, week):
        """Create and return a ``Course`` from one decoded event dict.

        The event is skipped (``None`` is returned and nothing is written
        to the database) when:

        * it starts outside ``[beginning_of_month, end_of_month)``;
        * ``today`` is given and it starts before it;
        * ``year`` and ``week`` are given and its ISO year/week differ;
        * it is a "Global Event";
        * no field of its text names a group.

        Fields of the text are read relative to the group field at index
        ``i``: ``i - 1`` is the name, ``i - 2`` the type, ``i + 1`` the
        rooms and ``i + 2`` the notes.  The first field is never used as
        a name or a type.
        """
        begin = timezone.make_aware(
            datetime.strptime(event["start"], "%Y-%m-%dT%H:%M:%S")
        )
        end = timezone.make_aware(
            datetime.strptime(event["end"], "%Y-%m-%dT%H:%M:%S")
        )

        if begin < beginning_of_month or begin >= end_of_month or \
           (today is not None and begin < today):
            return None

        if year is not None and week is not None:
            event_year, event_week, _ = begin.isocalendar()
            if event_year != year or event_week != week:
                return None

        # Every rejection test runs before the course is created, so a
        # skipped event never leaves an orphan row behind.
        data = event["text"].split("<br>")
        if data[0] == "Global Event":
            return None

        i = 1
        while i < len(data) and \
                not data[i].startswith(self._GROUP_PREFIXES):
            i += 1

        if i >= len(data):
            # No group field: skip the event, as the 2017 parser does for
            # events without groups.
            return None

        course = Course.objects.create(
            source=self.source, begin=begin, end=end
        )

        if i > 1:
            # De-duplicate while keeping the original order, so the name
            # is the same from one run to the next.
            name_parts = []
            for part in data[i - 1].split(";"):
                if part not in name_parts:
                    name_parts.append(part)
            course.name = ", ".join(name_parts)
        else:
            course.name = "Sans nom"

        if i > 2:
            course.type = data[i - 2]

        rooms = data[i + 1] if len(data) >= i + 2 else None
        if len(data) >= i + 3:
            course.notes = data[i + 2]

        groups = [
            Group.objects.get_or_create(
                source=self.source, celcat_name=name
            )[0]
            for name in data[i].split(";")
        ]
        course.groups.add(*groups)

        if rooms is not None:
            # Only rooms that already exist are attached; when none
            # matches, the raw text is kept at the top of the notes.
            rooms_objs = list(Room.objects.filter(name__in=rooms.split(";")))
            if rooms_objs:
                course.rooms.add(*rooms_objs)
            elif course.notes:
                course.notes = "{0}\n{1}".format(rooms, course.notes)
            else:
                course.notes = rooms

        if course.notes is not None:
            course.notes = course.notes.strip()

        return course

    def get_events(self, today, year=None, week=None):
        """Yield the courses of every downloaded month.

        ``self.events[i]`` is matched with ``self.months[i]``; an event
        is only kept in the month it starts in.  ``year`` and ``week``,
        when both given, restrict the output to that ISO week.
        ``get_source()`` must have been called first.
        """
        for i, month in enumerate(self.events):
            month_start = datetime.strptime(self.months[i], "%B, %Y")
            beginning_of_month = timezone.make_aware(month_start)
            # The month end is computed on the naive value and only then
            # made aware, so it gets the UTC offset in force on that day
            # rather than the one inherited from the start of the month.
            end_of_month = timezone.make_aware(get_next_month(month_start))

            for event in month:
                course = self.__get_event(event, today,
                                          beginning_of_month, end_of_month,
                                          year, week)
                if course is not None:
                    yield course

    def get_update_date(self):
        """Return ``None``: this format carries no update date."""
        return None

    def get_weeks(self):
        """Store and return a single-entry week mapping.

        The only entry maps ``"1"`` to the start of the current week.
        """
        # FIXME: detect the weeks automatically from the events found.
        beginning, _ = get_week(*get_current_week())
        self.weeks = {"1": beginning}

        return self.weeks

    def ajax_req(self, month):
        """Download the page of *month* and return its event list.

        *month* is a label such as ``"September, 2018"``.  The page is
        requested with a ``Date`` parameter set to the first Monday of
        that month (``YYYYMMDD``).  Raises ``requests.HTTPError`` on an
        HTTP error status.
        """
        month = datetime.strptime(month, "%B, %Y")
        first_monday = min(
            week[calendar.MONDAY]
            for week in calendar.monthcalendar(month.year, month.month)
            if week[calendar.MONDAY] > 0
        )
        month_str = month.replace(day=first_monday).strftime("%Y%m%d")

        req = self._make_request(
            self.source.url,
            headers={
                "Accept-Language": "en-US,en;q=0.5",
            },
            params={"Date": month_str},
        )
        req.raise_for_status()

        parser = lxml.html.HTMLParser(encoding="utf8")
        soup = lxml.html.document_fromstring(req.content, parser=parser)

        return find_events_list(soup)

    async def get_months_async(self):
        """Fetch every month but the first concurrently.

        Each ``ajax_req`` call runs in the loop's default executor.
        Returns the event lists in the order of ``self.months[1:]``.
        """
        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(None, self.ajax_req, month)
            for month in self.months[1:]
        ]

        return await asyncio.gather(*futures)

    def get_source_from_months(self, use_async=True, **kwargs):
        """Return the event lists of every month but the first.

        With ``use_async`` true (the default) the requests are issued
        concurrently; otherwise they are made one after the other.

        The flag used to be called ``async``, which is a reserved word
        since Python 3.7 and can no longer be a parameter name.  It is
        still accepted when passed through dict unpacking
        (``**{"async": False}``).
        """
        if "async" in kwargs:
            use_async = kwargs["async"]

        if not use_async:
            return [self.ajax_req(month) for month in self.months[1:]]

        # A private loop avoids depending on (or leaving behind) a global
        # event loop, and is closed whatever happens.
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(self.get_months_async())
        finally:
            loop.close()

    def get_source(self):
        """Collect, store and return the event lists of all months.

        The first list comes from the page downloaded by ``__init__``,
        the others from ``get_source_from_months()``.
        """
        self.events = [
            find_events_list(self.soup)
        ] + self.get_source_from_months()
        return self.events