hc-spider/hc_spider/json_logger.py

82 lines
2.9 KiB
Python

import json
import os
import threading
import time
import warnings
from typing import Callable, Union
from hc_spider.model import SharedObjects
class JSONLogger(threading.Thread):
logging_interval: int
_shared_objects: SharedObjects
def __init__(self, **kwargs) -> None:
self._shared_objects = SharedObjects(**kwargs)
self.logging_interval = self._shared_objects.config.get("logging_interval")
super().__init__()
self.daemon = True
self.name = "JSONLogger"
def start(self) -> None:
if os.path.exists(self._shared_objects.config.get("log_dir")) is False:
os.mkdir(self._shared_objects.config.get("log_dir"))
super().start()
def run(self) -> None:
print(f"[{self.name}] is starting")
while self._shared_objects.shutdown_event.is_set() is False:
time.sleep(self.logging_interval)
try:
self._dump_logs()
except RuntimeError as e:
print(f"[{self.name}] RuntimeError: {e}, continuing execution")
continue
print(f"[{self.name}] is shutting down", flush=True)
self._shutdown()
def _shutdown(self) -> None:
try:
self._shared_objects.worker_finished.wait(timeout=10)
except threading.BrokenBarrierError:
warnings.warn(f"{self.name}: Workers are still running, log files can be corrupted")
self._dump_logs()
def _dump_logs(self) -> None:
self._dump_not_valid_urls()
self._dump_otc_related_urls()
self._dump_urls_with_error()
if self._shared_objects.shutdown_event.is_set() is True:
self._dump_visited()
def _dump_data_to_file(self, data: list | dict, filename: str, sort_key: Union[Callable, None]) -> None:
if self._shared_objects.shutdown_event.is_set() is True:
data.sort(key=sort_key)
with open(f"{self._shared_objects.config.get('log_dir')}/{filename}.json", "w") as f:
f.write(json.dumps(data, indent=4))
def _dump_not_valid_urls(self) -> None:
data = list(self._shared_objects.not_valid_urls)
self._dump_data_to_file(data=data, filename="not_valid_urls", sort_key=lambda x: x.get("url"))
def _dump_visited(self) -> None:
data = list(self._shared_objects.visited_nodes)
self._dump_data_to_file(data=data, filename="visited_urls", sort_key=None)
def _dump_otc_related_urls(self) -> None:
data = [i for i in self._shared_objects.not_valid_urls if "otc" in i.get("url", "")]
self._dump_data_to_file(data=data, filename="otc_not_valid_urls", sort_key=lambda x: x.get("url"))
def _dump_urls_with_error(self) -> None:
data = list(self._shared_objects.urls_with_error)
self._dump_data_to_file(data=data, filename="urls_with_error", sort_key=lambda x: x.get("error").split()[0])
def __del__(self) -> None:
print(f"[{self.name}] exited", flush=True)