diff --git a/python/src/AjouSlackConsumer.py b/python/src/AjouSlackConsumer.py index 60f60d8543a4cce5ff6a2c36933081ea1ed236e7..1694137d390fe115564b8d7191e0291110aaa187 100644 --- a/python/src/AjouSlackConsumer.py +++ b/python/src/AjouSlackConsumer.py @@ -31,6 +31,7 @@ try: msg = c.poll(0.1) time.sleep(5) if msg is None: + time.sleep(10) continue elif not msg.error(): print("Received a message: {0}".format(msg.value())) diff --git a/python/src/AjouSlackProducer.py b/python/src/AjouSlackProducer.py index 3706dec1ad350989e83a95445f07ece7e85f710b..de7970d34a94e9b137f0f172e5da9dd1e6d8c83f 100644 --- a/python/src/AjouSlackProducer.py +++ b/python/src/AjouSlackProducer.py @@ -17,7 +17,7 @@ def acked(err, msg): if err is not None: print("Failed to deliver message: {0}: {1}".format(msg.value(), err.str())) else: - print("Message produced: {0}".format(msg.value())) # binary + print(f"Message produced: {0}...".format(msg.value())) # Make data into dictionary format @@ -35,6 +35,24 @@ def makeJson(postId, postTitle, postDate, postLink, postWriter): } +def checkOldness(jsonFile): + today = datetime.datetime.today() + today = datetime.datetime(today.year, today.month, today.day) + for post in list(jsonFile["POSTS"]): + currentDate = jsonFile["POSTS"][post]["DATE"] # string + savedDate = datetime.datetime.strptime(currentDate, "%y.%m.%d") + if (today - savedDate).days > MAXIMUM_DAY: + del jsonFile["POSTS"][post] + + with open(JSON_PATH, "w+") as f: + f.write(json.dumps(jsonFile)) + + with open(JSON_PATH, "r+") as f: + read = json.load(f) + + return read + + # Ajou notices parser def parser(): req = requests.get(f"{ADDRESS}?mode=list&&articleLimit=10&article.offset=0") @@ -55,6 +73,7 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) JSON_PATH = os.path.join(BASE_DIR, "already_read.json") LENGTH = 10 PRODUCED = 0 +MAXIMUM_DAY = 7 # remove notices in json that were posted more than 7days ago DUMP = lambda x: json.dumps(x) LOAD = lambda x: json.load(x) @@ -74,6 +93,8 @@ if not Path(JSON_PATH).is_file(): # 파일 없으면 기본 형식 만듬 with open(JSON_PATH, "r+") as f_read: read = LOAD(f_read) +read = checkOldness(read) + # Set last parsed time to rest 1 hour well LAST_PARSED = datetime.datetime.strptime(read["LAST_PARSED"], "%Y-%m-%d %H:%M:%S.%f") @@ -85,7 +106,16 @@ sc = WebClient(token) channel = "C01G2CR5MEE" # 아주대 # Kafka Producer 만들기 "localhost:9092" -settings = {"bootstrap.servers": Config.MY_SERVER} +settings = { + "bootstrap.servers": Config.MY_SERVER, + # Safe Producer settings + "enable.idempotence": True, + "acks": "all", + "retries": 10000000, + "max.in.flight": 5, + "compression.type": "lz4", + "linger.ms": 5, +} # "enable.idempotence": True, "retries": 5 p = Producer(settings) try: @@ -133,6 +163,7 @@ try: print(f"Sent {PRODUCED} posts...") else: print("No new posts yet...") + print("Parsed:", datetime.datetime.now()) except SlackApiError as e: assert e.response["ok"] is False diff --git a/python/src/ProducerDemo.py b/python/src/ProducerDemo.py index 9a8e8d68f74b26cf2942d66d21abf13b0b22be82..910e9eadaf5c56673ef7a356c7f3e40b742b6397 100644 --- a/python/src/ProducerDemo.py +++ b/python/src/ProducerDemo.py @@ -1,8 +1,18 @@ from confluent_kafka import Producer from config import Config +from contextlib import contextmanager -p = Producer({"bootstrap.servers": Config.MY_SERVER}) -p.produce(Config.TOPIC_ID, key="key_1", value="Hello") -p.flush(100) + +@contextmanager +def prod(settings): + p = Producer(settings) + yield p + p.flush(100) + + +settings = {"bootstrap.servers": Config.MY_SERVER} + +with prod(settings) as p: + p.produce(Config.TOPIC_ID, key="key_1", value="Hello") # kafka-console-consumer --bootstrap-server localhost:9092 --topic first-topic diff --git a/python/tests/test_open.py b/python/tests/test_open.py index ac5806389306f6d7c9ccc70c0ea909d786b023ff..82d0ba2babc41db17313558307ec2c60ee89da9b 100644 --- a/python/tests/test_open.py +++ b/python/tests/test_open.py @@ -2,10 +2,30 @@ import json import os from contextlib import contextmanager from pathlib import Path - +from datetime import datetime, timedelta BASE_DIR = os.path.dirname(os.path.abspath(__file__)) JSON_PATH = os.path.join(BASE_DIR, "test.json") +MAXIMUM_DAY = 6 + + +def checkOldness(jsonFile): + today = datetime.today() + today = datetime(today.year, today.month, today.day) + for post in list(jsonFile["POSTS"]): + currentDate = jsonFile["POSTS"][post]["DATE"] # string + savedDate = datetime.strptime(currentDate, "%y.%m.%d") + if (today - savedDate).days > MAXIMUM_DAY: + print(f"removing {post}...") + del jsonFile["POSTS"][post] + + with open(JSON_PATH, "w+") as f: + f.write(json.dumps(jsonFile)) + + with open(JSON_PATH, "r+") as f: + read = json.load(f) + + return read @contextmanager @@ -27,3 +47,17 @@ def test_open(): assert data is not None, "data is None." + +def test_remove(): + with open(JSON_PATH, "r+") as f_read: + read = json.load(f_read) + read = checkOldness(read) + + today = datetime.today() + today = datetime(today.year, today.month, today.day) + old = today - timedelta(days=MAXIMUM_DAY) + + for post in list(read["POSTS"]): + currentDate = read["POSTS"][post]["DATE"] # string + savedDate = datetime.strptime(currentDate, "%y.%m.%d") + assert savedDate > old, f"{MAXIMUM_DAY}일이 지난 공지는 제거되어야 함." diff --git a/python/tests/test_parser.py b/python/tests/test_parser.py index 736a684f2cf01a725205ff3618ab0f35c8e3e22b..92f757d37e7768ae9d935ded63869eef51ab8a74 100644 --- a/python/tests/test_parser.py +++ b/python/tests/test_parser.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup ADDRESS = "https://www.ajou.ac.kr/kr/ajou/notice.do" LENGTH = 10 +FILTER_WORDS = ("설문", "기프트", "납부", "등록금") # only parse if notices contain these words # Make data into dictionary format def makeJson(postId, postTitle, postDate, postLink, postWriter): @@ -44,10 +45,18 @@ def test_parse(): assert len(dates) == 10, f"Check your parser: {dates}" assert len(writers) == 10, f"Check your parser: {writers}" for i in range(LENGTH): + postTitle = posts[i].text.strip() + if FILTER_WORDS: + FILTERD = False + for filter in FILTER_WORDS: + if filter in postTitle: + FILTERD = True + break + if not FILTERD: + continue postId = ids[i].text.strip() postLink = posts[i].get("href") - postTitle = posts[i].text.strip() postDate = dates[i].text.strip() postWriter = writers[i].text assert int(postId) > 10000, f"postId is None."