#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PapersBot
#
# purpose: read journal RSS feeds and tweet selected entries
# license: MIT License
# author: Nina Miolane
# e-mail: nmiolane@gmail.com
# inspired by: https://github.com/fxcoudert/PapersBot

import imghdr  # note: imghdr is deprecated since Python 3.11
import json
import os
import random
import re
import sys
import time
import urllib.parse
import urllib.request

import bs4
import feedparser
import tweepy
import yaml


# This is the regular expression that selects the papers of interest
regex = re.compile(r"""( geometric.deep.learning |
                         geometric.machine.learning |
                         geometric.neural.net |
                         geometric.statistics |
                         geomstats |
                         geoopt |
                         hyperbolic.data |
                         non-euclidean.data |
                         (?=.*non-euclidean)(?=.*deep.learning) |
                         (?=.*non-euclidean)(?=.*machine.learning) |
                         (?=.*non-euclidean)(?=.*neural.net) |
                         (?=.*non-euclidean)(?=.*statistics) |
                         manopt |
                         mctorch |
                         (?=.*riemannian)(?=.*data) |
                         (?=.*riemannian)(?=.*deep.learning) |
                         (?=.*riemannian)(?=.*statistic) |
                         (?=.*riemannian)(?=.*machine.learning) |
                         (?=.*riemannian)(?=.*neural.net) |
                         theanogeometry
                       )
                    """, re.IGNORECASE | re.VERBOSE)


# We select entries based on title or summary (abstract, for some feeds)
def entryMatches(entry):
    # Malformed entry
    if "title" not in entry:
        return False
    if regex.search(entry.title):
        return True
    if "summary" in entry:
        return regex.search(entry.summary)
    else:
        return False


# Find the URL for an image associated with the entry
def findImage(entry):
    if "description" not in entry:
        return
    soup = bs4.BeautifulSoup(entry.description, "html.parser")
    img = soup.find("img")
    if img:
        img = img["src"]
        if len(img) == 0:
            return
        # If the address is relative, prepend the scheme and host of the entry
        if img[0] == "/":
            p = urllib.parse.urlparse(entry.id)
            img = f"{p.scheme}://{p.netloc}" + img
        return img


# Convert string from HTML to plain text
def htmlToText(s):
    return bs4.BeautifulSoup(s, "html.parser").get_text()


# Download the image at the given URL into a local temporary file
def downloadImage(url):
    if not url:
        return None
    try:
        img, _ = urllib.request.urlretrieve(url)
    except Exception:
        return None
    ext = imghdr.what(img)
    if ext is None:
        # Not a recognized image format
        os.remove(img)
        return None
    res = img + "." + ext
    os.rename(img, res)
    # Images smaller than 4 KB cause problems, and Twitter will complain
    if os.path.getsize(res) < 4096:
        os.remove(res)
        return None
    return res


# Connect to Twitter and authenticate
# Credentials are passed in the environment,
# or stored in "credentials.yml" which contains four lines:
#   CONSUMER_KEY: "x1F3s..."
#   CONSUMER_SECRET: "3VNg..."
#   ACCESS_KEY: "7109..."
#   ACCESS_SECRET: "AdnA..."
#
def initTwitter():
    if "CONSUMER_KEY" in os.environ:
        cred = {"CONSUMER_KEY": os.environ["CONSUMER_KEY"],
                "CONSUMER_SECRET": os.environ["CONSUMER_SECRET"],
                "ACCESS_KEY": os.environ["ACCESS_KEY"],
                "ACCESS_SECRET": os.environ["ACCESS_SECRET"]}
    else:
        with open("credentials.yml", "r") as f:
            cred = yaml.safe_load(f)
    auth = tweepy.OAuthHandler(cred["CONSUMER_KEY"], cred["CONSUMER_SECRET"])
    auth.set_access_token(cred["ACCESS_KEY"], cred["ACCESS_SECRET"])
    return tweepy.API(auth)
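
# The same four values can be supplied through the environment instead of
# credentials.yml, e.g. when running from CI (values are placeholders):
#   export CONSUMER_KEY="..." CONSUMER_SECRET="..."
#   export ACCESS_KEY="..." ACCESS_SECRET="..."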


def getTwitterConfig(api):
    # Check for a cached configuration, no more than a day old
    if os.path.isfile("twitter_config.dat"):
        mtime = os.stat("twitter_config.dat").st_mtime
        if time.time() - mtime < 24 * 60 * 60:
            with open("twitter_config.dat", "r") as f:
                return json.load(f)
    # Otherwise, query the Twitter API and cache the result
    config = api.configuration()
    with open("twitter_config.dat", "w") as f:
        json.dump(config, f)
    return config


# Read our list of feeds from file
def readFeedsList():
    with open("feeds.txt", "r") as f:
        feeds = [s.partition("#")[0].strip() for s in f]
    return [s for s in feeds if s]
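
# feeds.txt lists one feed URL per line; "#" starts a comment and blank
# lines are skipped. An illustrative example (the arXiv feed URL is real,
# the comment is not):
#   # machine learning preprints
#   http://export.arxiv.org/rss/cs.LG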


# Remove unwanted text some journals insert into the feeds
def cleanText(s):
    # Annoying ASAP tags
    s = s.replace("[ASAP]", "")
    # Some feeds have LF characters
    s = s.replace("\x0A", "")
    # Remove identifiers like (arXiv:1903.00279v1 [cond-mat.mtrl-sci])
    s = re.sub(r"\(arXiv:.+\)", "", s)
    # Collapse multiple spaces, strip leading and trailing whitespace
    return re.sub(r"\s\s+", " ", s).strip()
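
# For example, cleanText("[ASAP] Some Title (arXiv:1903.00279v1 [cond-mat.mtrl-sci])")
# returns "Some Title".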


# Read list of feed items already posted
def readPosted():
    try:
        with open("posted.dat", "r") as f:
            return f.read().splitlines()
    except Exception:
        return []


class PapersBot:
    posted = []
    n_seen = 0
    n_tweeted = 0

    def __init__(self, doTweet=True):
        self.feeds = readFeedsList()
        self.posted = readPosted()

        # Read parameters from the configuration file, if present
        try:
            with open("config.yml", "r") as f:
                config = yaml.safe_load(f)
        except Exception:
            config = {}
        self.throttle = config.get("throttle", 0)
        self.wait_time = config.get("wait_time", 5)
        self.shuffle_feeds = config.get("shuffle_feeds", True)
        self.blacklist = config.get("blacklist", [])
        self.blacklist = [re.compile(s) for s in self.blacklist]
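
        # A minimal config.yml using only the keys read above (all optional;
        # the blacklist pattern is illustrative):
        #   throttle: 5          # stop after this many tweets per run (0 = no limit)
        #   wait_time: 5         # seconds to sleep after each tweet
        #   shuffle_feeds: true
        #   blacklist:
        #     - "journals\\.example\\.com"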

        # Shuffle feeds list
        if self.shuffle_feeds:
            random.shuffle(self.feeds)

        # Connect to Twitter, unless requested not to
        if doTweet:
            self.api = initTwitter()
        else:
            self.api = None

        # Determine maximum tweet length
        if doTweet:
            twconfig = getTwitterConfig(self.api)
            urllen = max(twconfig["short_url_length"], twconfig["short_url_length_https"])
            imglen = twconfig["characters_reserved_per_media"]
        else:
            urllen = 23
            imglen = 24
        self.maxlength = 280 - (urllen + 1) - imglen
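        # With the fallback values above: 280 - (23 + 1) - 24 = 232 characters
        # remain for the title, after subtracting the separating space, the
        # t.co-wrapped URL, and the characters reserved for an attached image.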

        # Start-up banner
        print(f"This is PapersBot running at {time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
        if self.api:
            timeline = self.api.user_timeline(count=1)
            if len(timeline) > 0:
                print(f"Last tweet was posted at {timeline[0].created_at} (UTC)")
            else:
                print("No tweets posted yet? Welcome, new user!")
        print(f"Feed list has {len(self.feeds)} feeds\n")

    # Record a posted item, both on disk and in memory
    def addToPosted(self, url):
        with open("posted.dat", "a+") as f:
            print(url, file=f)
        self.posted.append(url)

    # Send a tweet for a given feed entry
    def sendTweet(self, entry):
        title = cleanText(htmlToText(entry.title))
        length = self.maxlength

        # Usually the ID is the canonical URL, but not always
        if entry.id.startswith(("https://", "http://")):
            url = entry.id
        else:
            url = entry.link

        # The URL may be malformed
        if not url.startswith(("https://", "http://")):
            print(f"INVALID URL: {url}\n")
            return

        tweet_body = title[:length] + " " + url

        # The URL may match our blacklist
        for regexp in self.blacklist:
            if regexp.search(url):
                print(f"BLACKLISTED: {tweet_body}\n")
                self.addToPosted(entry.id)
                return

        media = None
        image = findImage(entry)
        image_file = downloadImage(image)
        if image_file:
            print(f"IMAGE: {image}")
            if self.api:
                media = [self.api.media_upload(image_file).media_id]
            os.remove(image_file)

        print(f"TWEET: {tweet_body}\n")
        if self.api:
            try:
                self.api.update_status(tweet_body, media_ids=media)
            except tweepy.error.TweepError as e:
                # API code 187 means the status is a duplicate
                if e.api_code == 187:
                    print("ERROR: Tweet refused as duplicate\n")
                else:
                    print(f"ERROR: Tweet refused, {e.reason}\n")
                    sys.exit(1)
        self.addToPosted(entry.id)
        self.n_tweeted += 1

        if self.api:
            time.sleep(self.wait_time)

    # Main loop, iterating over feeds and posting new items
    def run(self):
        for feed in self.feeds:
            parsed_feed = feedparser.parse(feed)
            for entry in parsed_feed.entries:
                if entryMatches(entry):
                    self.n_seen += 1
                    # If no ID is provided, use the link as ID
                    if "id" not in entry:
                        entry.id = entry.link
                    if entry.id not in self.posted:
                        self.sendTweet(entry)
                        # Bail out if we have reached the max number of tweets
                        if self.throttle > 0 and self.n_tweeted >= self.throttle:
                            print(f"Max number of papers met ({self.throttle}), stopping now")
                            return

    # Print statistics for a given run
    def printStats(self):
        print(f"Number of relevant papers: {self.n_seen}")
        print(f"Number of papers tweeted: {self.n_tweeted}")

    # Print out the n top tweets (most liked and RT'ed)
    def printTopTweets(self, count=20):
        tweets = self.api.user_timeline(count=200)
        oldest = tweets[-1].created_at
        print(f"Top {count} recent tweets, by number of RT and likes, since {oldest}:\n")
        # Sort by total engagement, with the tweet ID as a tiebreaker
        tweets = [(t.retweet_count + t.favorite_count, t.id, t) for t in tweets]
        tweets.sort(reverse=True)
        for _, _, t in tweets[0:count]:
            url = f"https://twitter.com/{t.user.screen_name}/status/{t.id}"
            print(f"{t.retweet_count} RT {t.favorite_count} likes: {url}")
            print(f"  {t.created_at}")
            print(f"  {t.text}\n")
def main():
    # Make sure all options are correctly typed
    options_allowed = ["--do-not-tweet", "--top-tweets"]
    for arg in sys.argv[1:]:
        if arg not in options_allowed:
            print(f"Unknown option: {arg}")
            sys.exit(1)

    # Initialize our bot
    doTweet = "--do-not-tweet" not in sys.argv
    bot = PapersBot(doTweet)

    # We can print top tweets instead of running
    if "--top-tweets" in sys.argv:
        bot.printTopTweets()
        sys.exit(0)

    bot.run()
    bot.printStats()


if __name__ == "__main__":
    main()