From 41efdb4d779017ee9b6ae8af6ed07d59f622ff66 Mon Sep 17 00:00:00 2001
From: Seok Won <ikr@kakao.com>
Date: Sun, 20 Dec 2020 22:20:25 +0900
Subject: [PATCH] Update AjouNoticesParser

requests causes an SSL error. For now, we change it to urlopen with an
unverified context.
---
 python/src/AjouSlackProducer.py      | 16 +++++++++-------
 python/src/AjouSlackProducerMySQL.py | 16 +++++++++-------
 python/tests/test_parser.py          | 29 +++++++++++++++++------------
 3 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/python/src/AjouSlackProducer.py b/python/src/AjouSlackProducer.py
index 665e753..f389f2b 100644
--- a/python/src/AjouSlackProducer.py
+++ b/python/src/AjouSlackProducer.py
@@ -3,8 +3,10 @@ import json
 import os
+import ssl
 import time
 from pathlib import Path
+from urllib.error import URLError
+from urllib.request import urlopen
 
-import requests
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -199,16 +201,16 @@ class AjouParserJSON:
 
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()  # skip certificate verification
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except URLError:  # also catches its subclass HTTPError
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
diff --git a/python/src/AjouSlackProducerMySQL.py b/python/src/AjouSlackProducerMySQL.py
index 6f7a547..1ed9fbf 100644
--- a/python/src/AjouSlackProducerMySQL.py
+++ b/python/src/AjouSlackProducerMySQL.py
@@ -1,10 +1,12 @@
 import datetime
 import json
 import os
+import ssl
 import time
+from urllib.error import URLError
+from urllib.request import urlopen
 
 import mysql.connector
-import requests
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -187,16 +189,16 @@ class AjouParser:
 
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()  # skip certificate verification
        try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except URLError:  # also catches its subclass HTTPError
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
diff --git a/python/tests/test_parser.py b/python/tests/test_parser.py
index 92f757d..c08cc55 100644
--- a/python/tests/test_parser.py
+++ b/python/tests/test_parser.py
@@ -1,4 +1,5 @@
 import json
+import ssl
 
-import requests
+from urllib.request import urlopen
 from bs4 import BeautifulSoup
@@ -24,9 +25,13 @@ def makeJson(postId, postTitle, postDate, postLink, postWriter):
 
 
 def parser():
-    req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
-    req.encoding = "utf-8"
-    html = req.text
+    # req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
+    context = ssl._create_unverified_context()  # skip certificate verification
+    result = urlopen(
f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0", context=context + ) + + html = result.read() soup = BeautifulSoup(html, "html.parser") ids = soup.select("table > tbody > tr > td.b-num-box") posts = soup.select("table > tbody > tr > td.b-td-left > div > a") @@ -46,14 +51,14 @@ def test_parse(): assert len(writers) == 10, f"Check your parser: {writers}" for i in range(LENGTH): postTitle = posts[i].text.strip() - if FILTER_WORDS: - FILTERD = False - for filter in FILTER_WORDS: - if filter in postTitle: - FILTERD = True - break - if not FILTERD: - continue + # if FILTER_WORDS: + # FILTERD = False + # for filter in FILTER_WORDS: + # if filter in postTitle: + # FILTERD = True + # break + # if not FILTERD: + # continue postId = ids[i].text.strip() postLink = posts[i].get("href") -- GitLab