management: mise à jour du script scraptimetables

Signed-off-by: Alban Gruin <alban at pa1ch dot fr>
author: Alban Gruin 2018-09-06 18:17:20 +0200
committer: Alban Gruin 2018-09-06 18:17:20 +0200
commit: 713e3be3351a585c40838f7f7638f68d4088f7b6 (patch)
tree: c598e2744feded9cdc6fa1ba8dde8e320921eac1 /management
parent: 080606776d14aa5fdac21d43d028b01ed3a2ed4f (diff)
1 files changed, 18 insertions, 24 deletions
diff --git a/management/commands/scraptimetables.py b/management/commands/scraptimetables.py
index 4421e46..1904b42 100644
--- a/management/commands/scraptimetables.py
+++ b/management/commands/scraptimetables.py
@@ -1,4 +1,4 @@
-#    Copyright (C) 2017  Alban Gruin
+#    Copyright (C) 2017-2018  Alban Gruin
 #
 #    celcatsanitizer is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU Affero General Public License as published
@@ -13,11 +13,12 @@
 #    You should have received a copy of the GNU Affero General Public License
 #    along with celcatsanitizer.  If not, see <http://www.gnu.org/licenses/>.
 
+from bs4 import BeautifulSoup
 from django.core.management.base import BaseCommand
-from edt.models import Timetable, Year
-from ._private import get_from_db_or_create, get_xml
+from edt.models import Source, Timetable, Year
 
 import re
+import requests
 
 
 class Command(BaseCommand):
@@ -27,38 +28,31 @@ class Command(BaseCommand):
         parser.add_argument("--url", type=str, required=True)
 
     def handle(self, *args, **options):
-        for year, name, finder in self.__get_finders(options["url"]):
-            soup = get_xml(finder)
-            for link in soup.find_all("a"):
-                if "toutes sections et semestres confondus" in link.text:
-                    url_base = finder.rsplit("/", 1)
-                    timetable_url = "/".join([url_base[0], link.parent.parent.find("a", attrs={"class": "xmllink"})["href"]])
-
-                    try:
-                        timetable = Timetable.objects.get(year=year, name=name)
-                        timetable.url = timetable_url
-                    except:
-                        timetable = Timetable(year=year, name=name, url=timetable_url)
-                    finally:
-                        timetable.save()
-
-    def __get_finders(self, url):
-        soup = get_xml(url)
+        for year, name, source in self.__get_timetables(options["url"]):
+            source, _ = Source.objects.get_or_create(url=source)
+            timetable = Timetable(year=year, name=name, source=source)
+            timetable.save()
+
+    def __get_timetables(self, url):
+        req = requests.get(url)
+        soup = BeautifulSoup(req.content, "html.parser")
         choose_regex = re.compile("^- Choisissez votre ([\w ]+) -$")
 
         for form in soup.find_all("form"):
             for i, option in enumerate(form.find_all("option")):
                 if i == 0 and option.text == "- Choisissez le niveau -":
                     break
+                if "finder.html" in option["value"]:
+                    continue
 
                 search = choose_regex.search(option.text)
                 if search is not None:
-                    current_year = get_from_db_or_create(Year, name=search.groups(0)[0])
+                    current_year, _ = Year.objects.get_or_create(
+                        name=search.groups(0)[0])
                 else:
-                    finder = option["value"].replace("finder", "finder2")
+                    url = option["value"].replace("View=week", "View=month")
                     if option.text.startswith(current_year.name):
                         name = option.text[len(current_year.name):].strip()
                     else:
                         name = option.text
-
-                    yield current_year, name, finder
+                    yield current_year, name, url
author	Alban Gruin	2018-09-06 18:17:20 +0200
committer	Alban Gruin	2018-09-06 18:17:20 +0200
commit	713e3be3351a585c40838f7f7638f68d4088f7b6 (patch)
tree	c598e2744feded9cdc6fa1ba8dde8e320921eac1 /management
parent	080606776d14aa5fdac21d43d028b01ed3a2ed4f (diff)