Unverified Commit 3d33549b authored by Seok Won

Refactor AjouSlackProducer

Rework the producer into a class and apply small optimizations to make the code more human-readable.
parent dcf9e14c
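Before the diff itself, a minimal usage sketch of the refactored producer, based on the class docstring in the changes below. The module name AjouSlackProducer is assumed from the commit title, and Config.VM_SERVER / Config.AJOU_TOPIC_ID are assumed to come from the project's config.py; neither is confirmed beyond what the diff shows.

    # Usage sketch (assumptions noted above); not part of the commit itself.
    from config import Config
    from AjouSlackProducer import AjouParserJSON  # assumed module name

    ajou = AjouParserJSON(server=Config.VM_SERVER, json_name="already_read.json")
    ajou.run(period=3600)  # re-parse the notice board roughly once an hour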
@@ -9,163 +9,218 @@ from bs4 import BeautifulSoup
from config import Config
from confluent_kafka import Producer

DUMP = lambda x: json.dumps(x)
LOAD = lambda x: json.load(x)


class AjouParserJSON:
    """
    Ajou notices Parser using Slack API and Apache Kafka (JSON)

    JSON file will be saved in your current directory.

    Methods
    -------
    run(period=3600)

    Usage
    -----
        ajou = AjouParserJSON(Kafka_server_ip, json_path)
        ajou.run()
    """

    # HTML
    ADDRESS = "https://www.ajou.ac.kr/kr/ajou/notice.do"
    LENGTH = 10

    # JSON
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    MAXIMUM_DAY = 7  # remove notices from the JSON file posted more than 7 days ago

    __slots__ = ("settings", "producer", "read", "JSON_PATH")  # JSON_PATH is set in __init__

    def __init__(self, server=Config.VM_SERVER, json_name="already_read.json"):
        print("Initializing...")

        self.JSON_PATH = os.path.join(self.BASE_DIR, json_name)

        self.settings = {  # Producer settings
            "bootstrap.servers": server,
            "enable.idempotence": True,  # Safe
            "acks": "all",  # Safe
            "retries": 5,  # Safe
            "max.in.flight": 5,  # High throughput
            "compression.type": "lz4",  # High throughput
            "linger.ms": 5,  # High throughput
        }
        self.producer = Producer(self.settings)

        # Notice parser: create the default structure if the file does not exist yet
        if not Path(self.JSON_PATH).is_file():
            base_data = {"POSTS": {}, "LAST_PARSED": "1972-12-01 07:00:00.000000"}
            with open(self.JSON_PATH, "a+") as f:
                f.write(DUMP(base_data))

        # Load the saved notices
        self.read = None
        with open(self.JSON_PATH, "r+") as f_read:
            self.read = LOAD(f_read)

        self.read = self.checkOldness(self.read)

    def run(self, period=3600):  # period in seconds
        """Checks notices from the HTML page once per period and sends new data to the Kafka consumer."""
        p = self.producer
        read = self.read

        try:
            while True:  # repeat forever, once per period
                print()  # section separator
                PRODUCED = 0  # how many messages were sent this round

                LAST_PARSED = datetime.datetime.strptime(
                    read["LAST_PARSED"], "%Y-%m-%d %H:%M:%S.%f"
                )

                now = self.getTimeNow()
                diff = (now - LAST_PARSED).seconds

                print("Last parsed at", LAST_PARSED)
                if (diff / period) < 1:  # less than one period since the last update, so wait
                    print(f"Wait for {period - diff} seconds to sync new posts.")
                    time.sleep(period - diff)

                print("Trying to parse new posts...")
                ids, posts, dates, writers = self.parser()  # parse again
                assert ids is not None, f"Check your parser: {ids}."

                for i in range(self.LENGTH):
                    postId = ids[i].text.strip()
                    if postId in read["POSTS"]:
                        continue
                    postLink = self.ADDRESS + posts[i].get("href")
                    postTitle = posts[i].text.strip()
                    postDate = dates[i].text.strip()
                    postWriter = writers[i].text

                    # Remove a duplicated writer name from the title
                    duplicate = "[" + postWriter + "]"
                    if duplicate in postTitle:  # writer: [writer] title
                        postTitle = postTitle.replace(
                            duplicate, ""
                        ).strip()  # -> writer: title

                    kafkaData = self.makeData(
                        postId, postTitle, postDate, postLink, postWriter
                    )
                    print("\n>>> Sending a new post...:", postId)
                    PRODUCED += 1
                    p.produce(
                        Config.AJOU_TOPIC_ID,
                        value=DUMP(kafkaData[postId]),
                        callback=self.acked,
                    )
                    p.poll(1)  # send the data to Kafka (timeout in seconds)

                    read["LAST_PARSED"] = now.strftime("%Y-%m-%d %H:%M:%S.%f")
                    read["POSTS"].update(kafkaData)

                if PRODUCED:
                    print(f"Sent {PRODUCED} post(s)...")
                else:
                    print("\t** No new posts yet")
                print("Parsed at", now)

                with open(self.JSON_PATH, "w+") as f:
                    f.write(DUMP(read))
                with open(self.JSON_PATH, "r+") as f:
                    read = LOAD(f)

                print(f"Resting {period // 3600} hour...")
                time.sleep(period)

        except Exception as e:  # general exceptions
            print(dir(e))
        except KeyboardInterrupt:
            print("Pressed CTRL+C...")
        finally:
            print("\nExiting...")
            p.flush(100)

    # Producer delivery-report callback
    @staticmethod
    def acked(err, msg):
        if err is not None:
            print(
                "\t** Failed to deliver message: {0}: {1}".format(
                    msg.value(), err.str()
                )
            )
        else:
            print("Message produced correctly...")

    @staticmethod
    def makeData(postId, postTitle, postDate, postLink, postWriter):
        return {
            postId: {
                "TITLE": postTitle,
                "DATE": postDate,
                "LINK": postLink,
                "WRITER": postWriter,
            }
        }

    def checkOldness(self, jsonFile):
        """Drops notices older than MAXIMUM_DAY days and rewrites the JSON file."""
        today = datetime.datetime.today()
        today = datetime.datetime(today.year, today.month, today.day)
        for post in list(jsonFile["POSTS"]):
            currentDate = jsonFile["POSTS"][post]["DATE"]  # string
            savedDate = datetime.datetime.strptime(currentDate, "%y.%m.%d")
            if (today - savedDate).days > self.MAXIMUM_DAY:
                del jsonFile["POSTS"][post]

        with open(self.JSON_PATH, "w+") as f:
            f.write(json.dumps(jsonFile))
        with open(self.JSON_PATH, "r+") as f:
            read = json.load(f)

        return read

    @staticmethod
    def getTimeNow() -> datetime.datetime:
        return datetime.datetime.now()

    # Ajou notices parser
    def parser(self):
        try:
            req = requests.get(
                f"{self.ADDRESS}?mode=list&&articleLimit=10&article.offset=0"
            )
            req.raise_for_status()
        except requests.exceptions.ConnectionError:
            print("Seems like the server is down now.")
            return None, None, None, None
        req.encoding = "utf-8"
        html = req.text
        soup = BeautifulSoup(html, "html.parser")
        ids = soup.select("table > tbody > tr > td.b-num-box")
        posts = soup.select("table > tbody > tr > td.b-td-left > div > a")
        dates = soup.select(
            "table > tbody > tr > td.b-td-left > div > div > span.b-date"
        )
        writers = soup.select(
            "table > tbody > tr > td.b-td-left > div > div.b-m-con > span.b-writer"
        )
        return ids, posts, dates, writers


if __name__ == "__main__":
    ajou = AjouParserJSON()
    ajou.run()
@@ -12,11 +12,11 @@ from confluent_kafka import Producer

class AjouParser:
    """
    Ajou notices Parser using Slack API and Apache Kafka (MySQL)

    Methods
    -------
    run(server=Config.VM_SERVER, database="ajou_notices")

    Usage
    -----
@@ -55,7 +55,7 @@ class AjouParser:
            "bootstrap.servers": server,
            "enable.idempotence": True,  # Safe
            "acks": "all",  # Safe
            "retries": 5,  # Safe
            "max.in.flight": 5,  # High throughput
            "compression.type": "lz4",  # High throughput
            "linger.ms": 5,  # High throughput
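Both producers now share the same delivery-guarantee settings (idempotence enabled, acks=all, a small bounded retry count instead of the previous 10000000). Below is a standalone sketch of that configuration with a delivery-report callback; the broker address and topic name are placeholders, not values taken from this commit.

    from confluent_kafka import Producer

    settings = {
        "bootstrap.servers": "localhost:9092",  # placeholder broker address
        "enable.idempotence": True,  # avoids duplicates when a send is retried
        "acks": "all",               # wait for all in-sync replicas
        "retries": 5,
        "max.in.flight": 5,          # must be <= 5 while idempotence is enabled
        "compression.type": "lz4",
        "linger.ms": 5,              # small batching delay for throughput
    }

    def acked(err, msg):
        # Delivery report callback, invoked from poll()/flush() for each message.
        if err is not None:
            print("Delivery failed:", err)
        else:
            print("Delivered to", msg.topic(), "partition", msg.partition())

    p = Producer(settings)
    p.produce("ajou-notices", value='{"TITLE": "test"}', callback=acked)  # placeholder topic
    p.flush(10)  # wait up to 10 seconds for outstanding delivery reports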