Unverified Commit ec2cff9a authored by Seok Won

Update Ajou Notice Parser

Added a test with a filtering function (it filters notices against an array of keyword strings).

Made the producer safer and higher-throughput.

Added an old-notice remover to the producer.

Added a short rest between consumer polls.
parent 3915b0df
@@ -31,6 +31,7 @@ try:
        msg = c.poll(0.1)
        time.sleep(5)
        if msg is None:
            time.sleep(10)
            continue
        elif not msg.error():
            print("Received a message: {0}".format(msg.value()))
@@ -17,7 +17,7 @@ def acked(err, msg):
    if err is not None:
        print("Failed to deliver message: {0}: {1}".format(msg.value(), err.str()))
    else:
        print("Message produced: {0}".format(msg.value()))  # binary
        print(f"Message produced: {msg.value()}...")
# Make data into dictionary format
@@ -35,6 +35,24 @@ def makeJson(postId, postTitle, postDate, postLink, postWriter):
    }

def checkOldness(jsonFile):
    today = datetime.datetime.today()
    today = datetime.datetime(today.year, today.month, today.day)
    for post in list(jsonFile["POSTS"]):
        currentDate = jsonFile["POSTS"][post]["DATE"]  # string
        savedDate = datetime.datetime.strptime(currentDate, "%y.%m.%d")
        if (today - savedDate).days > MAXIMUM_DAY:
            del jsonFile["POSTS"][post]

    with open(JSON_PATH, "w+") as f:
        f.write(json.dumps(jsonFile))

    with open(JSON_PATH, "r+") as f:
        read = json.load(f)
    return read
# Ajou notices parser
def parser():
    req = requests.get(f"{ADDRESS}?mode=list&&articleLimit=10&article.offset=0")
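The pruning rule in checkOldness() is plain date arithmetic on the "%y.%m.%d" strings stored in the JSON. A minimal self-contained sketch of just that rule (the post ids and dates are made up):

import datetime

MAXIMUM_DAY = 7
posts = {
    "10001": {"DATE": "20.11.01"},  # stale: well over 7 days old
    "10002": {"DATE": datetime.datetime.today().strftime("%y.%m.%d")},  # fresh
}
today = datetime.datetime.today()
today = datetime.datetime(today.year, today.month, today.day)  # midnight today
for post_id in list(posts):  # list() so we can delete while iterating
    saved = datetime.datetime.strptime(posts[post_id]["DATE"], "%y.%m.%d")
    if (today - saved).days > MAXIMUM_DAY:
        del posts[post_id]
print(posts)  # only "10002" remains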
@@ -55,6 +73,7 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
JSON_PATH = os.path.join(BASE_DIR, "already_read.json")
LENGTH = 10
PRODUCED = 0
MAXIMUM_DAY = 7  # remove notices from the JSON that were posted more than 7 days ago
DUMP = lambda x: json.dumps(x)
LOAD = lambda x: json.load(x)
@@ -74,6 +93,8 @@ if not Path(JSON_PATH).is_file():  # create the default structure if the file is missing

with open(JSON_PATH, "r+") as f_read:
    read = LOAD(f_read)
    read = checkOldness(read)

# Record the last parsed time so the hourly rest works as intended
LAST_PARSED = datetime.datetime.strptime(read["LAST_PARSED"], "%Y-%m-%d %H:%M:%S.%f")
@@ -85,7 +106,16 @@ sc = WebClient(token)
channel = "C01G2CR5MEE"  # Ajou University channel

# Create a Kafka producer ("localhost:9092")
settings = {"bootstrap.servers": Config.MY_SERVER}
settings = {
    "bootstrap.servers": Config.MY_SERVER,
    # Safe-producer settings
    "enable.idempotence": True,
    "acks": "all",
    "retries": 10000000,
    "max.in.flight": 5,
    # High-throughput settings
    "compression.type": "lz4",
    "linger.ms": 5,
}
p = Producer(settings)
try:
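Why these settings make the producer "safe": with enable.idempotence the broker deduplicates retried batches, acks="all" waits on all in-sync replicas, and the bounded max.in.flight keeps retries from reordering messages; lz4 compression plus a small linger.ms batches sends for throughput. A minimal end-to-end sketch under the same settings (topic and server come from this repo's Config; the payload is made up):

import json
from confluent_kafka import Producer
from config import Config

def acked(err, msg):
    # Delivery callback: err is set when the broker rejected the message.
    if err is not None:
        print("Failed to deliver message: {0}: {1}".format(msg.value(), err.str()))
    else:
        print(f"Message produced: {msg.value()}...")

p = Producer({
    "bootstrap.servers": Config.MY_SERVER,
    "enable.idempotence": True,  # implies acks=all and bounded in-flight
    "compression.type": "lz4",
    "linger.ms": 5,
})
p.produce(Config.TOPIC_ID, value=json.dumps({"hello": "world"}), callback=acked)
p.flush(10)  # wait up to 10s for outstanding deliveries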
@@ -133,6 +163,7 @@ try:
        print(f"Sent {PRODUCED} posts...")
    else:
        print("No new posts yet...")
    print("Parsed:", datetime.datetime.now())
except SlackApiError as e:
    assert e.response["ok"] is False
from confluent_kafka import Producer
from config import Config
from contextlib import contextmanager

p = Producer({"bootstrap.servers": Config.MY_SERVER})
p.produce(Config.TOPIC_ID, key="key_1", value="Hello")
p.flush(100)

@contextmanager
def prod(settings):
    p = Producer(settings)
    yield p
    p.flush(100)

settings = {"bootstrap.servers": Config.MY_SERVER}
with prod(settings) as p:
    p.produce(Config.TOPIC_ID, key="key_1", value="Hello")

# kafka-console-consumer --bootstrap-server localhost:9092 --topic first-topic
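One caveat with the prod() helper above: if the with-body raises, the code after yield never runs and buffered messages are silently dropped. A try/finally variant guarantees the flush; a small sketch:

from contextlib import contextmanager
from confluent_kafka import Producer

@contextmanager
def prod(settings):
    p = Producer(settings)
    try:
        yield p
    finally:
        p.flush(100)  # flush runs even if the with-body raises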
@@ -2,10 +2,30 @@ import json
import os
from contextlib import contextmanager
from pathlib import Path
from datetime import datetime, timedelta

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
JSON_PATH = os.path.join(BASE_DIR, "test.json")
MAXIMUM_DAY = 6
def checkOldness(jsonFile):
    today = datetime.today()
    today = datetime(today.year, today.month, today.day)
    for post in list(jsonFile["POSTS"]):
        currentDate = jsonFile["POSTS"][post]["DATE"]  # string
        savedDate = datetime.strptime(currentDate, "%y.%m.%d")
        if (today - savedDate).days > MAXIMUM_DAY:
            print(f"removing {post}...")
            del jsonFile["POSTS"][post]

    with open(JSON_PATH, "w+") as f:
        f.write(json.dumps(jsonFile))

    with open(JSON_PATH, "r+") as f:
        read = json.load(f)
    return read
@contextmanager
@@ -27,3 +47,17 @@ def test_open():
    assert data is not None, "data is None."

def test_remove():
    with open(JSON_PATH, "r+") as f_read:
        read = json.load(f_read)
        read = checkOldness(read)

    today = datetime.today()
    today = datetime(today.year, today.month, today.day)
    old = today - timedelta(days=MAXIMUM_DAY)

    for post in list(read["POSTS"]):
        currentDate = read["POSTS"][post]["DATE"]  # string
        savedDate = datetime.strptime(currentDate, "%y.%m.%d")
        assert savedDate >= old, f"Notices more than {MAXIMUM_DAY} days old should have been removed."
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
ADDRESS = "https://www.ajou.ac.kr/kr/ajou/notice.do"
LENGTH = 10
FILTER_WORDS = ("설문", "기프트", "납부", "등록금")  # parse a notice only if its title contains one of these keywords
# Make data into dictionary format
def makeJson(postId, postTitle, postDate, postLink, postWriter):
@@ -44,10 +45,18 @@ def test_parse():
    assert len(dates) == 10, f"Check your parser: {dates}"
    assert len(writers) == 10, f"Check your parser: {writers}"

    for i in range(LENGTH):
        postTitle = posts[i].text.strip()
        if FILTER_WORDS:
            FILTERED = False
            for word in FILTER_WORDS:
                if word in postTitle:
                    FILTERED = True
                    break
            if not FILTERED:
                continue
        postId = ids[i].text.strip()
        postLink = posts[i].get("href")
        postTitle = posts[i].text.strip()
        postDate = dates[i].text.strip()
        postWriter = writers[i].text

        assert int(postId) > 10000, f"Unexpected postId: {postId}"
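The keyword scan above is a hand-rolled membership test; Python's built-in any() expresses the same filter in one line. A sketch with identical behavior:

# Equivalent filter using any(): skip titles that match no keyword.
if FILTER_WORDS and not any(word in postTitle for word in FILTER_WORDS):
    continue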