
Commit

Changing "stop at fifty" command line flag to a "max n" argument, wit…
Browse files Browse the repository at this point in the history
…h a default of 400. Version bump.
  • Loading branch information
GBirkel committed Sep 8, 2024
1 parent 75436ad commit 163796c
Showing 5 changed files with 59 additions and 45 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
@@ -1,5 +1,9 @@
ChangeLog - ljdump

Version 1.7.8 - 2024-09-07

- Changing "stop at fifty" command line flag to a "max n" argument, with a default of 400.

Version 1.7.7 - 2024-08-14

- Slightly better unicode handling for tags and music
6 changes: 3 additions & 3 deletions README.md
@@ -36,7 +36,7 @@ So, it's not possible to get the local HTML to look exactly like your online journal

## How to use ##

__To get the full archive of a very large journal, you may need to run the script a few times in a row, until it says there are no new changes.__
__To get the full archive of a very large journal, you may need to run the script multiple times, until it says there are no new changes.__ Take note of the `--max` command line argument (described below), which can be used to speed this up.

### Windows ###

@@ -100,9 +100,9 @@ Makes the script print a lot less status information to the console as it runs.

By default, this script constructs HTML pages after saving everything to the SQLite database. This flag skips the HTML.

`--fifty`
`--max n`

Only fetch 50 of the entries that are new since the last sync, then stop. Useful for testing the script output before you commit to downloading your whole journal.
Fetch a maximum of n entries and comments that are new since the last sync, then stop. The default is 400, but it can be set lower if you want to run a test, or higher if you want to download your whole journal at once and are confident the server won't complain. I recommend using the default at least once, then using a value of 1500 afterward until you're caught up.

`--cache_images`

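As a quick illustration of the batch size the new `--max` argument controls, here is a minimal sketch that calls the archiver function directly with `max_to_fetch`; the server URL, credentials, and journal name below are placeholders, and it assumes `ljdump.py` is importable from the working directory. Normal use goes through the command line, where `--max 1500` has the same effect.

```python
# Sketch only: placeholder values, direct function call instead of the CLI.
from ljdump import ljdump

ljdump(
    journal_server="https://www.livejournal.com/interface/xmlrpc",  # placeholder server
    username="example_user",                                        # placeholder
    password="example_password",                                    # placeholder
    journal_short_name="example_user",                              # placeholder
    max_to_fetch=1500,  # fetch up to 1500 new entries and comments, then stop
)
```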
82 changes: 46 additions & 36 deletions ljdump.py
@@ -3,7 +3,7 @@
#
# ljdump.py - livejournal archiver
# Greg Hewgill, Garrett Birkel, et al
# Version 1.7.7
# Version 1.7.8
#
# LICENSE
#
@@ -70,7 +70,7 @@ def gettext(e):
return e[0].firstChild.nodeValue


def ljdump(journal_server, username, password, journal_short_name, ljuniq=None, verbose=True, stop_at_fifty=False, make_pages=False, cache_images=False, retry_images=True):
def ljdump(journal_server, username, password, journal_short_name, ljuniq=None, verbose=True, max_to_fetch=100, make_pages=False, cache_images=False, retry_images=True):

m = re.search("(.*)/interface/xmlrpc", journal_server)
if m:
@@ -116,7 +116,7 @@ def authed(params):
# Entries (events)
#

original_last_sync = sync_status.last_sync
original_last_sync = sync_status['last_sync']

# The following code doesn't work because the server rejects our repeated calls.
# https://www.livejournal.com/doc/server/ljp.csp.xml-rpc.getevents.html
@@ -138,14 +138,18 @@ def authed(params):
#pprint.pprint(r)
#os._exit(os.EX_OK)

# There is apparently no support for fetching pages here, so repeated calls
# to this will fetch overlapping lists of events (which can be quite long)
# as we catch up to the present. If getevents syncitems (above) worked properly
# we could avoid this.
r = server.LJ.XMLRPC.syncitems(authed({
'ver': 1,
'lastsync': sync_status.last_sync, # this one is not helpful when you want update existing stuff
'lastsync': sync_status['last_sync'],
'usejournal': journal_short_name,
}))

if verbose:
print("Sync items to handle: %s" % (len(r['syncitems'])))
print("Sync items to process: %s out of %s returned." % (min(max_to_fetch, len(r['syncitems'])), len(r['syncitems'])))

for item in r['syncitems']:
if item['item'][0] == 'L':
@@ -176,7 +180,7 @@ def authed(params):

insert_or_update_event(cur, verbose, ev)

if stop_at_fifty and new_entry_count > 49:
if new_entry_count > max_to_fetch:
break

else:
@@ -188,14 +192,16 @@ def authed(params):
errors += 1

# Assuming these emerge from the server in order by date from least to most recent...
sync_status.last_sync = item['time']
sync_status['last_sync'] = item['time']

#
# Comments
#

max_comment_id = sync_status['last_max_comment_id']

if verbose:
print("Fetching journal comment metadata for \"%s\" starting at ID " % (journal_short_name, max_comment_id))
print("Fetching journal comment metadata for \"%s\" starting at ID %d" % (journal_short_name, max_comment_id))

try:
f = open("%s/comment.meta" % journal_short_name)
@@ -204,43 +210,44 @@ def authed(params):
except:
metacache = {}

max_comment_id = sync_status.last_max_comment_id
meta_comments_fetched_count = 0

new_max_comment_id = max_comment_id
while True:
url = "/export_comments.bml?get=comment_meta&startid=%d%s" % (max_comment_id+1, authas)
if stop_at_fifty:
url = "/export_comments.bml?get=comment_meta&startid=%d&numitems=50%s" % (max_comment_id+1, authas)
url = "/export_comments.bml?get=comment_meta&startid=%d&numitems=%d%s" % (new_max_comment_id+1, max_to_fetch, authas)
try:
try:
try:
r = urllib.request.urlopen(
urllib.request.Request(
journal_server + url,
headers = {'Cookie': "ljsession="+ljsession}
)
r = urllib.request.urlopen(
urllib.request.Request(
journal_server + url,
headers = {'Cookie': "ljsession="+ljsession}
)
meta = xml.dom.minidom.parse(r)
except Exception as x:
print("*** Error fetching comment meta, possibly not community maintainer?")
print("***", x)
break
finally:
try:
r.close()
except AttributeError: # r is sometimes a dict for unknown reasons
pass
)
meta = xml.dom.minidom.parse(r)
except Exception as x:
print("*** Error fetching comment meta, possibly not community maintainer?")
print("***", x)
finally:
try:
r.close()
except AttributeError: # r is sometimes a dict for unknown reasons
pass

for c in meta.getElementsByTagName("comment"):
id = int(c.getAttribute("id"))
meta_comments_fetched_count += 1
metacache[id] = {
'posterid': c.getAttribute("posterid"),
'state': c.getAttribute("state"),
}
if id > new_max_comment_id:
new_max_comment_id = id

maxid = int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue)
if verbose:
print("Fetched %d metadata entries. Our max_comment_id is now %s. Highest comment_id on server is %d." % (meta_comments_fetched_count, new_max_comment_id, maxid))

for u in meta.getElementsByTagName("usermap"):
insert_or_update_user_in_map(cur, verbose, u.getAttribute("id"), u.getAttribute("user"))
if new_max_comment_id >= int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue):
break

usermap = get_users_map(cur, verbose)

@@ -271,7 +278,7 @@ def authed(params):
try:
r = urllib.request.urlopen(
urllib.request.Request(
journal_server+"/export_comments.bml?get=comment_body&startid=%d&numitems=50%s" % (commentid, authas),
journal_server+"/export_comments.bml?get=comment_body&startid=%d&numitems=%d%s" % (commentid, meta_comments_fetched_count, authas),
headers = {'Cookie': "ljsession="+ljsession}
)
)
@@ -286,6 +293,9 @@ def authed(params):
id = int(c.getAttribute("id"))
if id in comments_already_fetched:
continue
# We fetch in chunks, so may have actually fetched bodies past the metadata we've collected.
if id > new_max_comment_id:
continue
jitemid = c.getAttribute("jitemid")

db_comment = {
@@ -409,7 +419,7 @@ def authed(params):
'filename': (picfn+ext),
'url': userpics[p]})

sync_status.last_max_comment_id = new_max_comment_id
sync_status['last_max_comment_id'] = new_max_comment_id

set_sync_status(cur, sync_status)

@@ -439,8 +449,8 @@ def authed(params):
help="reduce log output")
args.add_argument("--no_html", "-n", action='store_false', dest='make_pages',
help="don't process the journal data into HTML files.")
args.add_argument("--fifty", "-f", action='store_true', dest='fifty',
help="stop after synchronizing 50 entries, and do not fetch anything else")
args.add_argument('--max', type=int, default=400, dest='max_to_fetch',
help='Maximum number of entries and comments to fetch at a time. Default is 400.')
args.add_argument("--cache_images", "-i", action='store_true', dest='cache_images',
help="build a cache of images referenced in entries")
args.add_argument("--dont_retry_images", "-d", action='store_false', dest='retry_images',
@@ -502,7 +512,7 @@ def authed(params):
ljuniq=ljuniq,
journal_short_name=journal,
verbose=args.verbose,
stop_at_fifty=args.fifty,
max_to_fetch=args.max_to_fetch,
make_pages=args.make_pages,
cache_images=args.cache_images,
retry_images=args.retry_images
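The core of the ljdump.py change above is the comment-metadata loop: instead of a fixed `numitems=50`, each request now asks for `numitems=max_to_fetch` starting just past the highest comment ID seen so far, and the loop stops once that ID reaches the server's reported `maxid`. Here is a minimal sketch of that paging pattern, assuming a hypothetical `fetch_meta()` helper that performs one `export_comments.bml?get=comment_meta` request and returns the comment IDs plus the server's `maxid`; in the real script the request and XML parsing happen inline.

```python
# Paging sketch only. fetch_meta() is hypothetical; ljdump.py builds the URL
# and parses the XML inside its own while loop.
def collect_comment_ids(fetch_meta, last_max_comment_id, max_to_fetch):
    new_max_comment_id = last_max_comment_id
    seen_ids = set()
    while True:
        # Ask for the next batch, starting just past the highest ID we already have.
        ids, server_max_id = fetch_meta(startid=new_max_comment_id + 1,
                                        numitems=max_to_fetch)
        if not ids:
            break  # nothing new came back; avoid spinning forever
        for comment_id in ids:
            seen_ids.add(comment_id)
            if comment_id > new_max_comment_id:
                new_max_comment_id = comment_id
        # Stop once we have caught up with the highest comment ID on the server.
        if new_max_comment_id >= server_max_id:
            break
    return seen_ids, new_max_comment_id
```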
10 changes: 5 additions & 5 deletions ljdumpsqlite.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# ljdumpsqlite.py - SQLite support tools for livejournal archiver
# Version 1.7.7
# Version 1.7.8
#
# LICENSE
#
@@ -265,7 +265,7 @@ def get_sync_status_or_defaults(cur, last_sync, last_max_comment_id):
else:
last_sync = row[0]
last_max_comment_id = row[1]
status = {"last_sync": last_sync , "last_max_comment_id": last_max_comment_id}
status = {"last_sync": last_sync, "last_max_comment_id": last_max_comment_id}
return status


Expand Down Expand Up @@ -566,7 +566,7 @@ def insert_or_update_comment(cur, verbose, comment):
row = cur.fetchone()
if not row:
if verbose:
print('Adding new comment by %s for entry %s with id %s' % (comment['user'], comment['entryid'], comment['id']))
print('Adding new comment by %s for entry %s with ID %s' % (comment['user'], comment['entryid'], comment['id']))
cur.execute("""
INSERT INTO comments (
id,
@@ -592,7 +592,7 @@ def insert_or_update_comment(cur, verbose, comment):
return True
else:
if verbose:
print('Updating existing comment by %s for entry %s with id %s' % (comment['user'], comment['entryid'], comment['id']))
print('Updating existing comment by %s for entry %s with ID %s' % (comment['user'], comment['entryid'], comment['id']))
cur.execute("""
UPDATE comments SET
entryid = :entryid,
@@ -930,7 +930,7 @@ def set_sync_status(cur, status):
:param cur: database cursor
:param status: sync status record
"""
cur.execute("UPDATE status SET lastsync = ?, lastmaxcommentid = ?", (status.last_sync, status.last_max_comment_id))
cur.execute("UPDATE status SET lastsync = ?, lastmaxcommentid = ?", (status['last_sync'], status['last_max_comment_id']))


def finish_with_database(conn, cur):
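The ljdumpsqlite.py fixes above switch `sync_status` from attribute access to plain dictionary keys, matching what `get_sync_status_or_defaults` actually returns. Below is a minimal sketch of that round trip, using an in-memory table with the `lastsync` and `lastmaxcommentid` columns named in `set_sync_status`; it is an illustration only, not the real schema setup.

```python
# Sketch only: an in-memory table standing in for the real archive database.
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE status (lastsync TEXT, lastmaxcommentid INTEGER)")
cur.execute("INSERT INTO status VALUES (?, ?)", ("1970-01-01 00:00:00", 0))

# The status record travels as a plain dict, keyed the way ljdump.py now reads it.
sync_status = {"last_sync": "2024-09-07 12:00:00", "last_max_comment_id": 1234}
cur.execute("UPDATE status SET lastsync = ?, lastmaxcommentid = ?",
            (sync_status["last_sync"], sync_status["last_max_comment_id"]))
conn.commit()

row = cur.execute("SELECT lastsync, lastmaxcommentid FROM status").fetchone()
print({"last_sync": row[0], "last_max_comment_id": row[1]})
```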
2 changes: 1 addition & 1 deletion ljdumptohtml.py
@@ -3,7 +3,7 @@
#
# ljdumptohtml.py - convert sqlite livejournal archive to html pages
# Garrett Birkel et al
# Version 1.7.7
# Version 1.7.8
#
# LICENSE
#
