Unverified Commit 3d33549b authored by Seok Won

Refactor AjouSlackProducer

Restructure the producer using classes and small optimizations to make the code more human-readable.
parent dcf9e14c
@@ -9,163 +9,218 @@ from bs4 import BeautifulSoup
from config import Config
from confluent_kafka import Producer
# Producer callback function
def acked(err, msg):
if err is not None:
print("Failed to deliver message: {0}: {1}".format(msg.value(), err.str()))
else:
print("Message produced: {0}...".format(msg.value()))
DUMP = lambda x: json.dumps(x)
LOAD = lambda x: json.load(x)
# Make data into dictionary format
def makeJson(postId, postTitle, postDate, postLink, postWriter):
duplicate = "[" + postWriter + "]"
if duplicate in postTitle: # writer: [writer] title
postTitle = postTitle.replace(duplicate, "").strip() # -> writer: title
return {
postId: {
"TITLE": postTitle,
"DATE": postDate,
"LINK": ADDRESS + postLink,
"WRITER": postWriter,
}
}
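# Illustrative example of makeJson's output (sample values only; the link path is a placeholder):
# makeJson("10000", "[입학처] 설문조사", "20.12.04", "?articleNo=1", "입학처")
# -> {"10000": {"TITLE": "설문조사", "DATE": "20.12.04", "LINK": ADDRESS + "?articleNo=1", "WRITER": "입학처"}}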
def checkOldness(jsonFile):
    today = datetime.datetime.today()
    today = datetime.datetime(today.year, today.month, today.day)
    for post in list(jsonFile["POSTS"]):
        currentDate = jsonFile["POSTS"][post]["DATE"]  # string
        savedDate = datetime.datetime.strptime(currentDate, "%y.%m.%d")
        if (today - savedDate).days > MAXIMUM_DAY:
            del jsonFile["POSTS"][post]
    with open(JSON_PATH, "w+") as f:
        f.write(json.dumps(jsonFile))
    with open(JSON_PATH, "r+") as f:
        read = json.load(f)
    return read


global DUMP, LOAD


# Ajou notices parser
def parser():
    req = requests.get(f"{ADDRESS}?mode=list&&articleLimit=10&article.offset=0")
    req.encoding = "utf-8"
    html = req.text
    soup = BeautifulSoup(html, "html.parser")
    ids = soup.select("table > tbody > tr > td.b-num-box")
    posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
    dates = soup.select("table > tbody > tr > td.b-td-left > div > div > span.b-date")
    writers = soup.select(
        "table > tbody > tr > td.b-td-left > div > div.b-m-con > span.b-writer"
    )
    return ids, posts, dates, writers


ADDRESS = "https://www.ajou.ac.kr/kr/ajou/notice.do"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
JSON_PATH = os.path.join(BASE_DIR, "already_read.json")
LENGTH = 10
PRODUCED = 0
MAXIMUM_DAY = 7  # remove notices in json that were posted more than 7 days ago
DUMP = lambda x: json.dumps(x)
LOAD = lambda x: json.load(x)


class AjouParserJSON:
    """
    Ajou notices Parser using Slack API and Apache Kafka (JSON)

    JSON file will be saved in your current directory.

    Methods
    -------
    run(server=Config.VM_SERVER, json_name="already_read.json")

    Usage
    -----
        ajou = AjouParserJSON(Kafka_server_ip, json_path)
        ajou.run()
    """

    # HTML
    ADDRESS = "https://www.ajou.ac.kr/kr/ajou/notice.do"
    LENGTH = 10

    # JSON
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    MAXIMUM_DAY = 7  # remove notices in json that were posted more than 7 days ago

    __slots__ = ("settings", "producer", "read")

    def __init__(self, server=Config.VM_SERVER, json_name="already_read.json"):
        print("Initializing...")
self.JSON_PATH = os.path.join(self.BASE_DIR, json_name)
self.settings = { # Producer settings
"bootstrap.servers": server,
"enable.idempotence": True, # Safe
"acks": "all", # Safe
"retries": 5, # Safe
"max.in.flight": 5, # High throughput
"compression.type": "lz4", # High throughput
"linger.ms": 5, # High throughput
}
self.producer = Producer(self.settings)
# Notice parser
if not Path(JSON_PATH).is_file():  # create the default JSON structure if the file does not exist
if not Path(self.JSON_PATH).is_file():  # create the default JSON structure if the file does not exist
base_data = {"POSTS": {}, "LAST_PARSED": "1972-12-01 07:00:00.000000"}
with open(JSON_PATH, "a+") as f:
with open(self.JSON_PATH, "a+") as f:
f.write(DUMP(base_data))
read = None
self.read = None
# json read
with open(JSON_PATH, "r+") as f_read:
read = LOAD(f_read)
read = checkOldness(read)
# Create the Kafka Producer
settings = {
"bootstrap.servers": Config.MY_SERVER,
# Safe Producer settings
"enable.idempotence": True,
"acks": "all",
"retries": 10000000,
"max.in.flight": 5,
"compression.type": "lz4",
"linger.ms": 5,
} # "enable.idempotence": True, "retries": 5
p = Producer(settings)
with open(self.JSON_PATH, "r+") as f_read:
self.read = LOAD(f_read)
self.read = self.checkOldness(self.read)
def run(self, period=3600):  # period in seconds
"""Check notices from html per period and sends data to Kafka Consumer."""
p = self.producer
read = self.read
try:
while True:  # repeat forever, once per hour
PRODUCED = 0
print()  # Section
PRODUCED = 0  # how many messages were sent in this cycle
LAST_PARSED = datetime.datetime.strptime(
read["LAST_PARSED"], "%Y-%m-%d %H:%M:%S.%f"
)
now = datetime.datetime.now()
now = self.getTimeNow()
diff = (now - LAST_PARSED).seconds
print("Last parsing:", LAST_PARSED)
if diff / 3600 < 1:  # less than an hour since the last update, so wait
print(f"Wait for {3600 - diff} seconds to sync new posts.")
time.sleep(3600 - diff)
read["LAST_PARSED"] = now.strftime("%Y-%m-%d %H:%M:%S.%f")
print("Last parsed at", LAST_PARSED)
if (diff / period) < 1:  # less than one period since the last update, so wait
print(f"Wait for {period - diff} seconds to sync new posts.")
time.sleep(period - diff)
print("Trying to parse new posts...")
ids, posts, dates, writers = parser()  # parse again
for i in range(LENGTH):
ids, posts, dates, writers = self.parser()  # parse again
assert ids is not None, f"Check your parser: {ids}."
for i in range(self.LENGTH):
postId = ids[i].text.strip()
postLink = posts[i].get("href")
if postId in read["POSTS"]:
continue
postLink = self.ADDRESS + posts[i].get("href")
postTitle = posts[i].text.strip()
# postTitle = posts[i].get("title")
postDate = dates[i].text.strip()
postWriter = writers[i].text
data = makeJson(postId, postTitle, postDate, postLink, postWriter)
# {'10000': {'TITLE': '설문조사', 'DATE': '20.12.04', 'LINK': 'https', 'WRITER': '입학처'}}
# Removing a name duplication
duplicate = "[" + postWriter + "]"
if duplicate in postTitle: # writer: [writer] title
postTitle = postTitle.replace(
duplicate, ""
).strip() # -> writer: title
if postId not in read["POSTS"]:
print("Sending a new post...:", postId)
read["POSTS"].update(data)
kafkaData = self.makeData(
postId, postTitle, postDate, postLink, postWriter
)
print("\n>>> Sending a new post...:", postId)
PRODUCED += 1
p.produce(
Config.AJOU_TOPIC_ID, value=DUMP(data[postId]), callback=acked,
Config.AJOU_TOPIC_ID,
value=DUMP(kafkaData[postId]),
callback=self.acked,
)
p.poll(1)  # send the data to Kafka
else:
continue
p.poll(1)  # send the data to Kafka (timeout in seconds)
read["LAST_PARSED"] = now.strftime("%Y-%m-%d %H:%M:%S.%f")
read["POSTS"].update(kafkaData)
if PRODUCED:
print(f"Sent {PRODUCED} post(s)...")
else:
print("No new posts yet...")
print("Parsed:", datetime.datetime.now())
print("\t** No new posts yet")
print("Parsed at", now)
with open(JSON_PATH, "w+") as f:
with open(self.JSON_PATH, "w+") as f:
f.write(DUMP(read))
with open(JSON_PATH, "r+") as f:
with open(self.JSON_PATH, "r+") as f:
read = LOAD(f)
print("Resting 1 hour...")
time.sleep(3600)
print(f"Resting {period // 3600} hour...")
time.sleep(period)
except Exception as e:
print(type(e))
except Exception as e: # General exceptions
print(dir(e))
except KeyboardInterrupt:
print("Pressed CTRL+C...")
finally:
print("Exiting...")
print("\nExiting...")
p.flush(100)
# Producer callback function
@staticmethod
def acked(err, msg):
if err is not None:
print(
"\t** Failed to deliver message: {0}: {1}".format(
msg.value(), err.str()
)
)
else:
print("Message produced correctly...")
@staticmethod
def makeData(postId, postTitle, postDate, postLink, postWriter):
return {
postId: {
"TITLE": postTitle,
"DATE": postDate,
"LINK": postLink,
"WRITER": postWriter,
}
}
def checkOldness(self, jsonFile):
today = datetime.datetime.today()
today = datetime.datetime(today.year, today.month, today.day)
for post in list(jsonFile["POSTS"]):
currentDate = jsonFile["POSTS"][post]["DATE"] # string
savedDate = datetime.datetime.strptime(currentDate, "%y.%m.%d")
if (today - savedDate).days > self.MAXIMUM_DAY:
del jsonFile["POSTS"][post]
with open(self.JSON_PATH, "w+") as f:
f.write(json.dumps(jsonFile))
with open(self.JSON_PATH, "r+") as f:
read = json.load(f)
return read
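# Note: DATE strings use the "%y.%m.%d" format (e.g. "20.12.04"); with MAXIMUM_DAY = 7,
# any saved post dated more than 7 days before today is pruned from the JSON file.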
@staticmethod
def getTimeNow() -> datetime.datetime:
return datetime.datetime.now()
# Ajou notices parser
def parser(self):
try:
req = requests.get(
f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
)
req.raise_for_status()
except requests.exceptions.ConnectionError:
print("Seems like the server is down now.")
return None, None, None, None
req.encoding = "utf-8"
html = req.text
soup = BeautifulSoup(html, "html.parser")
ids = soup.select("table > tbody > tr > td.b-num-box")
posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
dates = soup.select(
"table > tbody > tr > td.b-td-left > div > div > span.b-date"
)
writers = soup.select(
"table > tbody > tr > td.b-td-left > div > div.b-m-con > span.b-writer"
)
return ids, posts, dates, writers
if __name__ == "__main__":
ajou = AjouParserJSON()
ajou.run()
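For reference, a minimal usage sketch of the refactored class with explicit arguments; the module name, broker address, and polling period below are placeholders, not values from this commit:

from AjouSlackProducerJSON import AjouParserJSON  # module name assumed

if __name__ == "__main__":
    ajou = AjouParserJSON(server="localhost:9092", json_name="already_read.json")
    ajou.run(period=1800)  # poll for new notices every 30 minutes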
@@ -12,11 +12,11 @@ from confluent_kafka import Producer
class AjouParser:
"""
Ajou notices Parser using Slack API and Apache Kafka
Ajou notices Parser using Slack API and Apache Kafka (MySQL)
Methods
-------
run(server=Config.VM_SERVER, channel="C01G2CR5MEE", database="ajou_notices")
run(server=Config.VM_SERVER, database="ajou_notices")
Usage
-----
@@ -55,7 +55,7 @@ class AjouParser:
"bootstrap.servers": server,
"enable.idempotence": True, # Safe
"acks": "all", # Safe
"retries": 10000000, # Safe
"retries": 5, # Safe
"max.in.flight": 5, # High throughput
"compression.type": "lz4", # High throughput
"linger.ms": 5, # High throughput