Unverified commit 41efdb4d authored by Seok Won

Update AjouNoticesParser

requests causes an SSL error.

For now, switch to urlopen with an unverified SSL context.
parent 83a8306f
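
For reference, a minimal sketch of the pattern this commit adopts, written against the public ssl API rather than the private ssl._create_unverified_context() helper; the URL is an assumed placeholder for the notice board address the parsers keep in ADDRESS:

    import ssl
    from urllib.error import URLError
    from urllib.request import urlopen

    # Public-API equivalent of ssl._create_unverified_context():
    # certificate verification is switched off entirely, so this is a
    # stopgap for a broken certificate chain, not a permanent setting.
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE

    # Placeholder URL; the parsers build the real one from ADDRESS plus
    # the mode, articleLimit, and article.offset query parameters.
    try:
        with urlopen("https://www.ajou.ac.kr", context=context) as resp:
            html = resp.read().decode("utf-8")
    except URLError as err:  # HTTPError is a subclass, so both land here
        print(f"Request failed: {err.reason}")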
@@ -3,8 +3,10 @@ import json
 import os
+import ssl
 import time
 from pathlib import Path
+from urllib.error import HTTPError
+from urllib.request import urlopen
 
-import requests
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -199,16 +200,16 @@ class AjouParserJSON:
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except HTTPError:
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
...
 import datetime
 import json
 import os
+import ssl
 import time
+from urllib.error import HTTPError
+from urllib.request import urlopen
 
 import mysql.connector
-import requests
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -187,16 +189,16 @@ class AjouParser:
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except HTTPError:
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
...
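A caveat on the two parser hunks above: urllib raises HTTPError only when the server answers with an HTTP error status (4xx/5xx). A server that is genuinely unreachable raises urllib.error.URLError instead, which except HTTPError: does not catch, so the "Seems like the server is down now." branch never runs for connection failures. A minimal sketch of the broader catch, where fetch_notices and url are illustrative names rather than code from this commit:

    import ssl
    from urllib.error import HTTPError, URLError
    from urllib.request import urlopen

    def fetch_notices(url):
        # Mirrors the commit's unverified context; HTTPError is handled
        # before URLError because it is a subclass of URLError.
        context = ssl._create_unverified_context()
        try:
            with urlopen(url, context=context) as resp:
                return resp.read()
        except HTTPError as err:  # the server answered with a 4xx/5xx status
            print(f"Server responded with {err.code}")
        except URLError as err:  # DNS failure, refused connection, timeout
            print("Seems like the server is down now:", err.reason)
        return None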
 import json
-import requests
+import ssl
+from urllib.request import urlopen
 from bs4 import BeautifulSoup
@@ -24,9 +25,13 @@ def makeJson(postId, postTitle, postDate, postLink, postWriter):
 def parser():
-    req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
-    req.encoding = "utf-8"
-    html = req.text
+    # req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
+    context = ssl._create_unverified_context()
+    result = urlopen(
+        f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0", context=context
+    )
+    html = result.read()
     soup = BeautifulSoup(html, "html.parser")
     ids = soup.select("table > tbody > tr > td.b-num-box")
     posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
@@ -46,14 +51,14 @@ def test_parse():
     assert len(writers) == 10, f"Check your parser: {writers}"
 
     for i in range(LENGTH):
         postTitle = posts[i].text.strip()
-        if FILTER_WORDS:
-            FILTERD = False
-            for filter in FILTER_WORDS:
-                if filter in postTitle:
-                    FILTERD = True
-                    break
-            if not FILTERD:
-                continue
+        # if FILTER_WORDS:
+        #     FILTERD = False
+        #     for filter in FILTER_WORDS:
+        #         if filter in postTitle:
+        #             FILTERD = True
+        #             break
+        #     if not FILTERD:
+        #         continue
         postId = ids[i].text.strip()
         postLink = posts[i].get("href")
...