Unverified Commit 41efdb4d authored by Seok Won

Update AjouNoticesParser

requests causes an SSL error.

For now, we switch to urlopen with an unverified context.
parent 83a8306f
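In short, the workaround replaces a verified requests.get() call with urlopen() over an unverified SSL context. A minimal sketch of the pattern (the URL is a placeholder; note that ssl._create_unverified_context() skips certificate and hostname checks entirely, so this trades security for availability):

    import ssl
    from urllib.request import urlopen

    # Unverified context: no certificate or hostname verification (stopgap only)
    context = ssl._create_unverified_context()
    with urlopen("https://example.ac.kr/notices", context=context) as result:
        html = result.read()  # bytes; BeautifulSoup with "html.parser" handles decoding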
@@ -3,8 +3,9 @@ import json
 import os
 import time
 from pathlib import Path
-import requests
+from urllib.error import HTTPError
+from urllib.request import urlopen
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -199,16 +200,16 @@ class AjouParserJSON:
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except HTTPError:
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
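One caveat, which applies to the second parser below as well: urlopen raises HTTPError only when the server answers with an error status (4xx/5xx). A server that is actually down (connection refused, DNS failure, broken TLS handshake) raises the broader URLError instead, which this except clause does not catch. A small sketch covering both cases, assuming url and context as in the diff:

    from urllib.error import HTTPError, URLError

    try:
        result = urlopen(url, context=context)
    except HTTPError as err:  # subclass of URLError, so it must be caught first
        print(f"Server responded with an error status: {err.code}")
    except URLError as err:  # connection-level failure
        print(f"Seems like the server is down now: {err.reason}")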
......
 import datetime
 import json
 import os
+import ssl
 import time
+from urllib.error import HTTPError
+from urllib.request import urlopen
 import mysql.connector
-import requests
 from bs4 import BeautifulSoup
 from config import Config
 from confluent_kafka import Producer
@@ -187,16 +189,16 @@ class AjouParser:
     # Ajou notices parser
     def parser(self):
+        context = ssl._create_unverified_context()
         try:
-            req = requests.get(
-                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
+            result = urlopen(
+                f"{self.ADDRESS}?mode=list&&articleLimit={self.LENGTH}&article.offset=0",
+                context=context,
             )
-            req.raise_for_status()
-        except requests.exceptions.ConnectionError:
+        except HTTPError:
             print("Seems like the server is down now.")
             return None, None, None, None
-        req.encoding = "utf-8"
-        html = req.text
+        html = result.read()
         soup = BeautifulSoup(html, "html.parser")
         ids = soup.select("table > tbody > tr > td.b-num-box")
         posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
......
 import json
-import requests
+import ssl
+from urllib.request import urlopen
 from bs4 import BeautifulSoup
@@ -24,9 +25,13 @@ def makeJson(postId, postTitle, postDate, postLink, postWriter):
 def parser():
-    req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
-    req.encoding = "utf-8"
-    html = req.text
+    # req = requests.get(f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0")
+    context = ssl._create_unverified_context()
+    result = urlopen(
+        f"{ADDRESS}?mode=list&&articleLimit={LENGTH}&article.offset=0", context=context
+    )
+    html = result.read()
     soup = BeautifulSoup(html, "html.parser")
     ids = soup.select("table > tbody > tr > td.b-num-box")
     posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
@@ -46,14 +51,14 @@ def test_parse():
     assert len(writers) == 10, f"Check your parser: {writers}"
     for i in range(LENGTH):
         postTitle = posts[i].text.strip()
-        if FILTER_WORDS:
-            FILTERD = False
-            for filter in FILTER_WORDS:
-                if filter in postTitle:
-                    FILTERD = True
-                    break
-            if not FILTERD:
-                continue
+        # if FILTER_WORDS:
+        #     FILTERD = False
+        #     for filter in FILTER_WORDS:
+        #         if filter in postTitle:
+        #             FILTERD = True
+        #             break
+        #     if not FILTERD:
+        #         continue
         postId = ids[i].text.strip()
         postLink = posts[i].get("href")
......
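Since the commit message frames this as a temporary fix ("For now"), a follow-up could restore verification by supplying a CA bundle instead of disabling checks, assuming the SSL error stems from an incomplete certificate chain rather than a genuinely invalid certificate. A sketch using the certifi package (an assumption, not part of this repo):

    import ssl
    import certifi
    from urllib.request import urlopen

    # Verified context backed by certifi's CA bundle
    context = ssl.create_default_context(cafile=certifi.where())
    result = urlopen("https://example.ac.kr/notices", context=context)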