Initial commit.
commit
04ea9c9e5e
|
@ -0,0 +1,220 @@
|
||||||
|
import atexit
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from concurrent import futures
|
||||||
|
from itertools import zip_longest
|
||||||
|
|
||||||
|
import grpc
|
||||||
|
import numpy as np
|
||||||
|
import psycopg2
|
||||||
|
import requests
|
||||||
|
import schedule
|
||||||
|
import yaml
|
||||||
|
from fake_useragent import UserAgent
|
||||||
|
|
||||||
|
import id_service_pb2
|
||||||
|
import id_service_pb2_grpc
|
||||||
|
|
||||||
|
# Resolve paths relative to this file so the service works from any CWD.
DIR = os.path.dirname(os.path.realpath(__file__))
BATCHER_STATE_FILE = os.path.join(DIR, "batcher_state.pkl")
SERVICE_STATE_FILE = os.path.join(DIR, "id_service_state.pkl")

# Load the batcher configuration. safe_load is sufficient for a plain
# key/value config file and is the recommended loader for data you did
# not author as Python objects.
with open(os.path.join(DIR, "batcher_config.yaml")) as f:
    config = yaml.safe_load(f)
|
||||||
|
|
||||||
|
|
||||||
|
class IDBatcher:
    """Maintains a growing queue of fresh reddit post/comment IDs.

    The newest observed post and comment IDs are tracked so each
    scheduler tick can enqueue exactly the IDs created since the previous
    tick. State is pickled to disk so a restart resumes where it left off.
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.ids_queue = deque()
        self.ua = UserAgent()

        # Resume from persisted state if present; otherwise bootstrap from
        # the newest live post and comment IDs.
        if os.path.exists(BATCHER_STATE_FILE):
            with open(BATCHER_STATE_FILE, "rb") as f:
                self.last_post_id, self.last_comment_id, self.ids_queue = pickle.load(f)
        else:
            self.last_post_id = self.fetch_newest_id("/r/all/new.json")
            self.last_comment_id = self.fetch_newest_id("/r/all/comments.json")
            self.save_state()

        self.fetch_new_ids()

        schedule.every(5).seconds.do(self.fetch_new_ids)

        # Daemon thread: without it, the endless scheduler loop keeps the
        # process alive after main exits and atexit handlers never run.
        thread = threading.Thread(target=self.start_scheduler, daemon=True)
        thread.start()

    def start_scheduler(self):
        """Run pending scheduled jobs forever (executes on a daemon thread)."""
        while True:
            schedule.run_pending()
            time.sleep(1)

    def fetch_newest_id(self, endpoint):
        """Return the newest item ID on `endpoint` as an integer.

        A random User-Agent is sent because reddit throttles the default
        requests UA aggressively.
        """
        reddit_id = requests.get(
            f"https://www.reddit.com{endpoint}",
            headers={"User-Agent": self.ua.random},
        ).json()["data"]["children"][0]["data"]["name"]

        return self.reddit_id_to_int(reddit_id)

    def fetch_new_ids(self):
        """Enqueue every post/comment ID created since the last fetch."""
        # Perform the network round-trips before taking the lock so
        # get_batch() is never blocked waiting on reddit.
        try:
            new_post_id = self.fetch_newest_id("/r/all/new.json")
            new_comment_id = self.fetch_newest_id("/r/all/comments.json")
        except requests.RequestException:
            # Best-effort: skip this tick. The next tick covers the gap
            # because last_post_id / last_comment_id are unchanged; an
            # uncaught exception here would kill the scheduler loop.
            return

        new_post_ids = [
            self.int_to_reddit_id("t3", i)
            for i in range(self.last_post_id + 1, new_post_id + 1)
        ]
        new_comment_ids = [
            self.int_to_reddit_id("t1", i)
            for i in range(self.last_comment_id + 1, new_comment_id + 1)
        ]

        with self.lock:
            self.ids_queue.extend(self.interleave(new_post_ids, new_comment_ids))
            self.last_post_id = new_post_id
            self.last_comment_id = new_comment_id

        print("Queue size:", len(self.ids_queue))
        self.save_state()

    def get_batch(self):
        """Pop and return up to 100 IDs from the front of the queue."""
        with self.lock:
            return [
                self.ids_queue.popleft() for _ in range(min(100, len(self.ids_queue)))
            ]

    def save_state(self):
        """Persist the ID counters and queue so a restart can resume."""
        with open(BATCHER_STATE_FILE, "wb") as f:
            pickle.dump((self.last_post_id, self.last_comment_id, self.ids_queue), f)

    @staticmethod
    def reddit_id_to_int(reddit_id):
        """Convert a fullname like 't3_abc123' to its integer ID value."""
        _prefix, base36 = reddit_id.split("_")
        return int(base36, 36)

    @staticmethod
    def int_to_reddit_id(id_type, i):
        """Convert an integer ID back to a lowercase fullname, e.g. 't3_abc123'."""
        id_b36 = np.base_repr(i, 36)
        return f"{id_type}_{id_b36}".lower()

    @staticmethod
    def interleave(list1, list2):
        """Alternate items from both lists, then append the longer tail."""
        return [
            item
            for pair in zip_longest(list1, list2)
            for item in pair
            if item is not None
        ]
|
||||||
|
|
||||||
|
|
||||||
|
class IDService(id_service_pb2_grpc.IDServiceServicer):
    """gRPC servicer that leases ID batches to clients and stores results.

    A batch stays leased to a client until it calls SubmitBatch; if the
    client does not submit within 120 seconds, the lease expires and its
    IDs are returned to the front of the batcher's queue.
    """

    def __init__(self, batcher):
        self.batcher = batcher
        self.active_batches = {}  # client_id -> currently leased batch
        self.timers = {}  # client_id -> lease-expiry Timer
        self.lock = threading.Lock()

        # NOTE(review): config values are interpolated into the DSN; fine
        # for trusted local config, but keep credentials out of logs.
        self.conn = psycopg2.connect(
            f"dbname={config['db']['name']} "
            f"user={config['db']['user']} "
            f"password={config['db']['password']}"
        )
        self.cur = self.conn.cursor()
        self.cur.execute(
            """
            CREATE TABLE IF NOT EXISTS reddit_data(
                id SERIAL PRIMARY KEY,
                data JSONB NOT NULL
            )
            """
        )
        self.conn.commit()

        # Restore leases that were outstanding when the service last exited.
        if os.path.exists(SERVICE_STATE_FILE):
            with open(SERVICE_STATE_FILE, "rb") as f:
                self.active_batches = pickle.load(f)

    def GetBatch(self, request, context):
        """Lease a batch of IDs to the requesting client.

        Re-requesting before submission returns the same outstanding
        batch; each request (re)starts the 120-second lease timer.
        """
        client_id = request.client_id

        with self.lock:
            # Hand back any unconfirmed batch instead of leasing a new one.
            if client_id in self.active_batches:
                batch = self.active_batches[client_id]
            else:
                batch = self.batcher.get_batch()
                self.active_batches[client_id] = batch

            # Cancel any running lease timer before starting a fresh one.
            if client_id in self.timers:
                self.timers[client_id].cancel()

            self.timers[client_id] = threading.Timer(
                120, self.return_batch, [client_id]
            )
            self.timers[client_id].start()

        return id_service_pb2.BatchResponse(ids=batch)

    def SubmitBatch(self, request, context):
        """Persist a client's fetched data and release its lease."""
        client_id = request.client_id

        with self.lock:
            if client_id in self.active_batches:
                self.write_json_to_postgres(request.data)

                del self.active_batches[client_id]

                # Lease fulfilled: stop and drop its expiry timer.
                if client_id in self.timers:
                    self.timers[client_id].cancel()
                    del self.timers[client_id]

        return id_service_pb2.SubmitResponse(success=True)

    def return_batch(self, client_id):
        """Timer callback: return an expired lease's IDs to the queue front."""
        with self.lock:
            if client_id in self.active_batches:
                batch = self.active_batches[client_id]
                # deque.extendleft inserts items one by one at the left,
                # which reverses its argument — pass the batch reversed so
                # the IDs keep their original order at the queue front.
                # Take the batcher's lock: that is the lock guarding its
                # queue (lock order service -> batcher matches GetBatch).
                with self.batcher.lock:
                    self.batcher.ids_queue.extendleft(reversed(batch))
                del self.active_batches[client_id]
            # Drop the fired timer so self.timers does not grow forever.
            self.timers.pop(client_id, None)

    def save_state(self):
        """Persist outstanding leases so a restart does not lose IDs."""
        with open(SERVICE_STATE_FILE, "wb") as f:
            pickle.dump(self.active_batches, f)

    def write_json_to_postgres(self, json_data):
        """Insert each JSON string in `json_data` as one reddit_data row."""
        json_tuples = [(json_str,) for json_str in json_data]
        self.cur.executemany(
            "INSERT INTO reddit_data (data) VALUES (%s)",
            json_tuples,
        )
        self.conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def serve():
    """Wire the batcher to the gRPC service and block serving requests."""
    batcher = IDBatcher()
    service = IDService(batcher)

    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    id_service_pb2_grpc.add_IDServiceServicer_to_server(service, grpc_server)

    # Persist both components' state when the service is interrupted.
    atexit.register(service.save_state)
    atexit.register(batcher.save_state)

    grpc_server.add_insecure_port("[::]:50051")
    grpc_server.start()
    grpc_server.wait_for_termination()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: start the batcher and serve gRPC requests forever.
if __name__ == "__main__":
    serve()
|
|
@ -0,0 +1,4 @@
|
||||||
|
# PostgreSQL connection settings used by the ID service.
db:
  user: postgres
  # NOTE(review): empty password presumably relies on local trust auth —
  # set a real password for any non-local deployment.
  password: ""
  name: postgres
|
|
@ -0,0 +1,5 @@
|
||||||
|
#!/usr/bin/env bash
# Regenerate the Python gRPC stubs next to id_service.proto.
# Fail fast on any error, unset variable, or pipeline failure.
set -euo pipefail

# Quote all path expansions so paths containing spaces do not word-split.
full_path=$(realpath "$0")
dir=$(dirname "$full_path")
python -m grpc_tools.protoc --python_out="$dir" --grpc_python_out="$dir" "$dir/id_service.proto" -I="$dir"
|
|
@ -0,0 +1,171 @@
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
|
||||||
|
import grpc
|
||||||
|
import requests
|
||||||
|
import yaml
|
||||||
|
from fake_useragent import UserAgent
|
||||||
|
|
||||||
|
import id_service_pb2
|
||||||
|
import id_service_pb2_grpc
|
||||||
|
|
||||||
|
# Load fetcher_config.yaml from this file's directory.
DIR = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(DIR, "fetcher_config.yaml")) as f:
    # safe_load: a plain config file needs no arbitrary-object tags.
    config = yaml.safe_load(f)

# Create the log directory up front; RotatingFileHandler raises if its
# target directory does not exist.
os.makedirs(os.path.join(DIR, "logs"), exist_ok=True)
log_file = os.path.join(DIR, "logs", "fetcher.log")

# Configure logger: DEBUG and up to the rotating file, INFO and up to the console.
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)

# Add rotating file handler (20 backups of up to 10 MB each).
rh = RotatingFileHandler(log_file, maxBytes=10 * 10**6, backupCount=20)
rh.setLevel(logging.DEBUG)
rh.setFormatter(formatter)
logger.addHandler(rh)

# Add stream handler.
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)
|
||||||
|
|
||||||
|
|
||||||
|
class RedditClient:
    """Minimal reddit OAuth client with rate limiting and failure backoff."""

    ACCESS_TOKEN_URL = "https://www.reddit.com/api/v1/access_token"
    INFO_API_URL = "https://oauth.reddit.com/api/info.json"
    BACKOFF_INCREMENT = 5  # seconds added to the delay per consecutive failure
    MIN_DELAY = 1  # minimum seconds between consecutive requests

    def __init__(self):
        self.headers = {"User-Agent": UserAgent().random}
        self.last_request = time.time()
        self.backoff = 0  # current failure backoff, in seconds

        # Rate-limit window as last reported by reddit's response headers.
        self.ratelimit_remaining = 300
        self.ratelimit_reset = time.time()

        # OAuth access token and its expiry (epoch seconds); fetched lazily.
        self.token = None
        self.token_expiration = time.time()

    def sleep_until_ready(self):
        """Block until another request may be sent without violating limits."""
        # If the rate-limit window is exhausted, wait for it to reset.
        if self.ratelimit_remaining == 0 and time.time() < self.ratelimit_reset:
            time.sleep(self.ratelimit_reset - time.time())

        delay = max(
            0, self.MIN_DELAY + self.backoff - (time.time() - self.last_request)
        )

        # The backoff is already folded into `delay` above; the previous
        # `time.sleep(delay + self.backoff)` counted it twice.
        time.sleep(delay)

    def authorize(self):
        """Exchange the configured refresh token for a fresh access token."""
        self.sleep_until_ready()
        self.last_request = time.time()

        res = requests.post(
            self.ACCESS_TOKEN_URL,
            params={
                "grant_type": "refresh_token",
                "refresh_token": config["reddit"]["refresh_token"],
            },
            headers=self.headers,
            auth=(config["reddit"]["client_id"], config["reddit"]["client_secret"]),
        )

        if res.status_code == 200:
            logger.info("Auth successful.")
            self.backoff = 0
            self.token = res.json()["access_token"]
            self.token_expiration = time.time() + res.json()["expires_in"]
        else:
            self.backoff += self.BACKOFF_INCREMENT
            logger.error("Auth failed.")

    def update_ratelimit(self, headers):
        """Refresh rate-limit bookkeeping from response headers, if present.

        Both headers are required together: reading the reset header
        unguarded raised KeyError when reddit omitted it.
        """
        if "x-ratelimit-remaining" in headers and "x-ratelimit-reset" in headers:
            self.ratelimit_remaining = int(float(headers["x-ratelimit-remaining"]))
            self.ratelimit_reset = time.time() + int(
                float(headers["x-ratelimit-reset"])
            )

    def fetch(self, ids):
        """Fetch full item data for `ids`; return a list of JSON strings.

        Returns None when a token refresh was performed instead, or when
        the request failed — the caller is expected to retry.
        """
        # Refresh the token if missing or expiring within the next minute.
        if self.token is None or self.token_expiration - time.time() < 60:
            self.authorize()
            return None

        self.sleep_until_ready()
        self.last_request = time.time()

        params = {"id": ",".join(ids)}
        logger.debug(f"GET /api/info.json?id={params['id']}")
        res = requests.get(
            self.INFO_API_URL,
            params=params,
            headers=self.headers | {"Authorization": f"bearer {self.token}"},
        )

        if res.status_code == 200:
            self.backoff = 0
            self.update_ratelimit(res.headers)
            logger.debug(f"Response: {res.text}")
            items = res.json()["data"]["children"]
            return [json.dumps(item) for item in items]
        else:
            self.update_ratelimit(res.headers)
            self.backoff += self.BACKOFF_INCREMENT
            logger.error(f"Bad status: {res.status_code}. Backoff: {self.backoff}.")
            return None
|
||||||
|
|
||||||
|
|
||||||
|
class FetcherClient:
    """Pulls ID batches from the distributor, fetches them, submits data."""

    def __init__(self, client_id, reddit_client):
        self.client_id = client_id
        self.reddit_client = reddit_client
        # Larger send limit: a batch of fetched JSON blobs can be big,
        # while the incoming ID lists stay small.
        self.channel = grpc.insecure_channel(
            config["distributor_uri"],
            options=[
                ("grpc.max_send_message_length", 100 * 10**6),
                ("grpc.max_receive_message_length", 1 * 10**6),
            ],
        )
        self.stub = id_service_pb2_grpc.IDServiceStub(self.channel)

    def request_and_process_batch(self):
        """Lease one batch of IDs, fetch their data, and submit it back."""
        # Request a batch of IDs from the server.
        response = self.stub.GetBatch(
            id_service_pb2.BatchRequest(client_id=self.client_id)
        )

        # Retry until the reddit fetch succeeds: fetch() returns None after
        # a token refresh or a failed request and applies its own backoff.
        data = None
        while data is None:
            data = self.reddit_client.fetch(response.ids)

        # Submit the fetched batch to the server.
        self.stub.SubmitBatch(
            id_service_pb2.SubmitRequest(client_id=self.client_id, data=data)
        )

    def run(self):
        """Continuously request and process batches, surviving RPC errors."""
        while True:
            try:
                self.request_and_process_batch()
                time.sleep(1)
            except grpc.RpcError as e:
                # Use the module logger instead of print so errors reach
                # the rotating log file like every other message.
                logger.error(f"gRPC Error: {e.details()}")
                time.sleep(10)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: build the clients from config and fetch forever.
if __name__ == "__main__":
    reddit_client = RedditClient()
    fetcher_client = FetcherClient(config["id"], reddit_client)
    fetcher_client.run()
|
|
@ -0,0 +1,6 @@
|
||||||
|
# Unique ID for this fetcher instance; the distributor tracks leased
# batches per client_id, so each fetcher needs its own value.
id: fetcher-1
# host:port of the batcher/distributor gRPC server.
distributor_uri: localhost:50051
# Reddit OAuth app credentials; see the README for obtaining a refresh token.
reddit:
  client_id: abc
  client_secret: abc
  refresh_token: 123-abc
|
|
@ -0,0 +1,32 @@
|
||||||
|
syntax = "proto3";

package IDService;

// Distributes reddit ID batches to fetcher clients and collects the
// fetched data back from them.
service IDService {
    // Fetch a batch of IDs. Calling again before SubmitBatch returns the
    // same outstanding batch for that client.
    rpc GetBatch (BatchRequest) returns (BatchResponse) {}

    // Submit processed batch of data.
    rpc SubmitBatch (SubmitRequest) returns (SubmitResponse) {}
}

// The BatchRequest message contains the client id.
message BatchRequest {
    string client_id = 1;
}

// The BatchResponse message contains the IDs.
message BatchResponse {
    repeated string ids = 1;
}

// The SubmitRequest message contains the client id and a batch of data
// (one JSON string per fetched item).
message SubmitRequest {
    string client_id = 1;
    repeated string data = 2;
}

// The SubmitResponse message confirms successful batch processing.
message SubmitResponse {
    bool success = 1;
}
|
|
@ -0,0 +1,16 @@
|
||||||
|
```bash
|
||||||
|
# ...install PostgreSQL...
|
||||||
|
pip install -r requirements.txt
|
||||||
|
# Rename the example configs, then edit them to match your setup.
mv batcher_config.example.yaml batcher_config.yaml
mv fetcher_config.example.yaml fetcher_config.yaml
|
||||||
|
mkdir logs
|
||||||
|
bash compile_proto.sh
|
||||||
|
|
||||||
|
# Then run both of these files:
|
||||||
|
python batcher.py
|
||||||
|
python fetcher.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Getting a refresh token:
|
||||||
|
https://praw.readthedocs.io/en/stable/tutorials/refresh_token.html#obtaining-refresh-tokens
|
|
@ -0,0 +1,10 @@
|
||||||
|
fake_useragent==1.1.3
|
||||||
|
grpcio==1.56.0
|
||||||
|
grpcio-tools==1.56.0
|
||||||
|
numpy==1.24.2
|
||||||
|
praw==7.7.0
|
||||||
|
protobuf==4.23.4
|
||||||
|
psycopg2_binary==2.9.6
|
||||||
|
PyYAML==6.0
|
||||||
|
Requests==2.31.0
|
||||||
|
schedule==1.2.0
|
Loading…
Reference in New Issue