Unverified Commit 41efdb4d authored by Seok Won

Update AjouNoticesParser

requests causes an SSL error.

For now, we switch to urlopen with an unverified context.
parent 83a8306f
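In short, the workaround replaces a verified requests.get() call with urlopen() over an unverified SSL context. A minimal sketch of the pattern (the URL is a placeholder; note that ssl._create_unverified_context() skips certificate and hostname checks entirely, so this trades security for availability):

    import ssl
    from urllib.request import urlopen

    # Unverified context: no certificate or hostname verification (stopgap only)
    context = ssl._create_unverified_context()
    with urlopen("https://example.ac.kr/notices", context=context) as result:
        html = result.read()  # bytes; BeautifulSoup with "html.parser" handles decoding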
@@ -3,8 +3,9 @@ import json
 import os
 import time
 from pathlib import Path
-import requests
+from urllib.error import HTTPError
+from urllib.request import urlopen
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -199,16 +200,16 @@ class AjouParserJSON:
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except HTTPError:
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
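One caveat, which applies to the second parser below as well: urlopen raises HTTPError only when the server answers with an error status (4xx/5xx). A server that is actually down (connection refused, DNS failure, broken TLS handshake) raises the broader URLError instead, which this except clause does not catch. A small sketch covering both cases, assuming url and context as in the diff:

    from urllib.error import HTTPError, URLError

    try:
        result = urlopen(url, context=context)
    except HTTPError as err:  # subclass of URLError, so it must be caught first
        print(f"Server responded with an error status: {err.code}")
    except URLError as err:  # connection-level failure
        print(f"Seems like the server is down now: {err.reason}")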
......
 import datetime
 import json
 import os
+import ssl
 import time
+from urllib.error import HTTPError
+from urllib.request import urlopen
 import mysql.connector
-import requests
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -187,16 +189,16 @@ class AjouParser:
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except HTTPError:
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
......
 import json
-import requests
+import ssl
+from urllib.request import urlopen
 from bs4 import BeautifulSoup
@@ -24,9 +25,13 @@ def makeJson(postId, postTitle, postDate, postLink, postWriter):
 def parser():
-    req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
-    req.encoding = "utf-8"
-    html = req.text
+    # req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
+    context = ssl._create_unverified_context()
+    result = urlopen(
+        f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0", context=context
+    )
+    html = result.read()
     soup = BeautifulSoup(html, "html.parser")
     ids = soup.select("table > tbody > tr > td.b-num-box")
     posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
@@ -46,14 +51,14 @@ def test_parse():
     assert len(writers) == 10, f"Check your parser: {writers}"
     for i in range(LENGTH):
         postTitle = posts[i].text.strip()
-        if FILTER_WORDS:
-            FILTERD = False
-            for filter in FILTER_WORDS:
-                if filter in postTitle:
-                    FILTERD = True
-                    break
-            if not FILTERD:
-                continue
+        # if FILTER_WORDS:
+        #     FILTERD = False
+        #     for filter in FILTER_WORDS:
+        #         if filter in postTitle:
+        #             FILTERD = True
+        #             break
+        #     if not FILTERD:
+        #         continue
         postId = ids[i].text.strip()
         postLink = posts[i].get("href")
......
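Since the commit message frames this as a temporary fix ("For now"), a follow-up could restore verification by supplying a CA bundle instead of disabling checks, assuming the SSL error stems from an incomplete certificate chain rather than a genuinely invalid certificate. A sketch using the certifi package (an assumption, not part of this repo):

    import ssl
    import certifi
    from urllib.request import urlopen

    # Verified context backed by certifi's CA bundle
    context = ssl.create_default_context(cafile=certifi.where())
    result = urlopen("https://example.ac.kr/notices", context=context)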