diff options
| author | Alban Gruin | 2018-10-09 20:30:07 +0200 | 
|---|---|---|
| committer | Alban Gruin | 2018-10-09 20:30:07 +0200 | 
| commit | 11f340b4c3adb4200ff41e7e4587392b10b13e47 (patch) | |
| tree | 38e55db95168f24a1f287b643850cd16cf32b621 | |
| parent | 536061a041b5ea0ea0b64a8f4e5fa3b99a49861a (diff) | |
ups2018: vérification de la présence des événements dans la page (ag/ups2018-correctifs-apres-tests)
Il arrive que les pages retournées par la source soient incomplètes et
ne contiennent pas d’événement, et parfois des mois entiers sont
vides.
On tente donc de récupérer une page trois fois au maximum, et, si cela
échoue toujours, on abandonne en levant une exception.
Signed-off-by: Alban Gruin <alban at pa1ch dot fr>
| -rw-r--r-- | management/parsers/ups2018.py | 67 | 
1 files changed, 37 insertions, 30 deletions
| diff --git a/management/parsers/ups2018.py b/management/parsers/ups2018.py index 522a26a..f1da5bf 100644 --- a/management/parsers/ups2018.py +++ b/management/parsers/ups2018.py @@ -28,20 +28,22 @@ import requests  from ...models import Course, Group, Room  from ...utils import get_current_week, get_week -from .abstractparser import AbstractParser +from .abstractparser import AbstractParser, ParserError  VARNAME = "v.events.list = "  def find_events_list(soup):      res = [] +    found = False      for script in soup.xpath("//script/text()"):          if VARNAME in script:              for var in script.split('\n'):                  if var.startswith(VARNAME):                      res = json.loads(var[len(VARNAME):-2]) +                    found = True -    return res +    return res, found  def get_next_month(dt): @@ -52,21 +54,40 @@ def get_next_month(dt):  class Parser(AbstractParser):      def __init__(self, source):          super(Parser, self).__init__(source) +        self.events = [self._make_request(source.url)] +        self.source = source + +    def _make_request(self, url, date=None): +        events, found = [], False +        attempts = 0 +        params = {} + +        if date is not None: +            params["Date"] = date + +        while not found: +            if attempts == 3: +                raise ParserError("Failed to retrieve {0}".format(url)) +            attempts += 1 + +            # En-tête tiré de mon Firefox… +            req = super(Parser, self)._make_request( +                url, params=params, +                headers={"Accept-Language": "en-US,en;q=0.5"}, +            ) +            req.raise_for_status() -        # En-tête tiré de mon Firefox… -        base_req = self._make_request( -            source.url, headers={"Accept-Language": "en-US,en;q=0.5"} -        ) +            parser = lxml.html.HTMLParser(encoding="utf8") +            soup = lxml.html.document_fromstring(req.content, parser=parser) +            events, found 
= find_events_list(soup) -        parser = lxml.html.HTMLParser(encoding="utf-8") -        self.soup = lxml.html.document_fromstring( -            base_req.content, parser=parser -        ) +        if date is None: +            self.months = [] +            for option in soup.xpath("//option"): +                if option.get("selected") is not None or len(self.months) > 0: +                    self.months.append(option.text) -        self.months = [] -        for option in self.soup.xpath("//option"): -            if option.get("selected") is not None or len(self.months) > 0: -                self.months.append(option.text) +        return events      def __get_event(self, event, today,                      beginning_of_month, end_of_month, @@ -179,19 +200,7 @@ class Parser(AbstractParser):          )          month_str = month.replace(day=first_monday).strftime("%Y%m%d") -        req = self._make_request( -            self.source.url, -            headers={ -                "Accept-Language": "en-US,en;q=0.5", -            }, -            params={"Date": month_str}, -        ) -        req.raise_for_status() - -        parser = lxml.html.HTMLParser(encoding="utf8") -        soup = lxml.html.document_fromstring(req.content, parser=parser) - -        return find_events_list(soup) +        return self._make_request(self.source.url, month_str)      @asyncio.coroutine      def get_months_async(self): @@ -217,7 +226,5 @@ class Parser(AbstractParser):          return events      def get_source(self): -        self.events = [ -            find_events_list(self.soup) -        ] + self.get_source_from_months() +        self.events += self.get_source_from_months()          return self.events | 
