
Commit

Changing "stop at fifty" command line flag to a "max n" argument, wit…
Browse files Browse the repository at this point in the history
…h a default of 400. Version bump.
  • Loading branch information
GBirkel committed Sep 8, 2024
1 parent 75436ad commit 163796c
Showing 5 changed files with 59 additions and 45 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
@@ -1,5 +1,9 @@
ChangeLog - ljdump

Version 1.7.8 - 2024-09-07

- Changing "stop at fifty" command line flag to a "max n" argument, with a default of 400.

Version 1.7.7 - 2024-08-14

- Slightly better unicode handling for tags and music
6 changes: 3 additions & 3 deletions README.md
@@ -36,7 +36,7 @@ So, it's not possible to get the local HTML to look exactly like your online journal

## How to use ##

__To get the full archive of a very large journal, you may need to run the script a few times in a row, until it says there are no new changes.__
__To get the full archive of a very large journal, you may need to run the script multiple times, until it says there are no new changes.__ Take note of the `--max` command line argument (described below), which can be used to speed this up.

### Windows ###

@@ -100,9 +100,9 @@ Makes the script print a lot less status information to the console as it runs.

By default, this script constructs HTML pages after saving everything to the SQLite database. This flag skips the HTML.

`--fifty`
`--max n`

Only fetch 50 of the entries that are new since the last sync, then stop. Useful for testing the script output before you commit to downloading your whole journal.
Fetch a maximum of n entries and comments that are new since the last sync, then stop. The default is 400, but it can be set lower if you want to run a test, or higher if you want to download your whole journal at once and are confident the server won't complain. I recommend using the default at least once, then using a value of 1500 afterward until you're caught up.

`--cache_images`

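As a quick illustration of the batch size the new `--max` argument controls, here is a minimal sketch that calls the archiver function directly with `max_to_fetch`; the server URL, credentials, and journal name below are placeholders, and it assumes `ljdump.py` is importable from the working directory. Normal use goes through the command line, where `--max 1500` has the same effect.

```python
# Sketch only: placeholder values, direct function call instead of the CLI.
from ljdump import ljdump

ljdump(
    journal_server="https://www.livejournal.com/interface/xmlrpc",  # placeholder server
    username="example_user",                                        # placeholder
    password="example_password",                                    # placeholder
    journal_short_name="example_user",                              # placeholder
    max_to_fetch=1500,  # fetch up to 1500 new entries and comments, then stop
)
```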
82 changes: 46 additions & 36 deletions ljdump.py
@@ -3,7 +3,7 @@
#
# ljdump.py - livejournal archiver
# Greg Hewgill, Garrett Birkel, et al
# Version 1.7.7
# Version 1.7.8
#
# LICENSE
#
@@ -70,7 +70,7 @@ def gettext(e):
return e[0].firstChild.nodeValue


def ljdump(journal_server, username, password, journal_short_name, ljuniq=None, verbose=True, stop_at_fifty=False, make_pages=False, cache_images=False, retry_images=True):
def ljdump(journal_server, username, password, journal_short_name, ljuniq=None, verbose=True, max_to_fetch=100, make_pages=False, cache_images=False, retry_images=True):

m = re.search("(.*)/interface/xmlrpc", journal_server)
if m:
@@ -116,7 +116,7 @@ def authed(params):
# Entries (events)
#

original_last_sync = sync_status.last_sync
original_last_sync = sync_status['last_sync']

# The following code doesn't work because the server rejects our repeated calls.
# https://www.livejournal.com/doc/server/ljp.csp.xml-rpc.getevents.html
@@ -138,14 +138,18 @@ def authed(params):
#pprint.pprint(r)
#os._exit(os.EX_OK)

# There is apparently no support for fetching pages here, so repeated calls
# to this will fetch overlapping lists of events (which can be quite long)
# as we catch up to the present. If getevents syncitems (above) worked properly
# we could avoid this.
r = server.LJ.XMLRPC.syncitems(authed({
'ver': 1,
'lastsync': sync_status.last_sync, # this one is not helpful when you want update existing stuff
'lastsync': sync_status['last_sync'],
'usejournal': journal_short_name,
}))

if verbose:
print("Sync items to handle: %s" % (len(r['syncitems'])))
print("Sync items to process: %s out of %s returned." % (min(max_to_fetch, len(r['syncitems'])), len(r['syncitems'])))

for item in r['syncitems']:
if item['item'][0] == 'L':
@@ -176,7 +180,7 @@ def authed(params):

insert_or_update_event(cur, verbose, ev)

if stop_at_fifty and new_entry_count > 49:
if new_entry_count > max_to_fetch:
break

else:
@@ -188,14 +192,16 @@ def authed(params):
errors += 1

# Assuming these emerge from the server in order by date from least to most recent...
sync_status.last_sync = item['time']
sync_status['last_sync'] = item['time']

#
# Comments
#

max_comment_id = sync_status['last_max_comment_id']

if verbose:
print("Fetching journal comment metadata for \"%s\" starting at ID " % (journal_short_name, max_comment_id))
print("Fetching journal comment metadata for \"%s\" starting at ID %d" % (journal_short_name, max_comment_id))

try:
f = open("%s/comment.meta" % journal_short_name)
@@ -204,43 +210,44 @@ def authed(params):
except:
metacache = {}

max_comment_id = sync_status.last_max_comment_id
meta_comments_fetched_count = 0

new_max_comment_id = max_comment_id
while True:
url = "/export_comments.bml?get=comment_meta&startid=%d%s" % (max_comment_id+1, authas)
if stop_at_fifty:
url = "/export_comments.bml?get=comment_meta&startid=%d&numitems=50%s" % (max_comment_id+1, authas)
url = "/export_comments.bml?get=comment_meta&startid=%d&numitems=%d%s" % (new_max_comment_id+1, max_to_fetch, authas)
try:
try:
try:
r = urllib.request.urlopen(
urllib.request.Request(
journal_server + url,
headers = {'Cookie': "ljsession="+ljsession}
)
r = urllib.request.urlopen(
urllib.request.Request(
journal_server + url,
headers = {'Cookie': "ljsession="+ljsession}
)
meta = xml.dom.minidom.parse(r)
except Exception as x:
print("*** Error fetching comment meta, possibly not community maintainer?")
print("***", x)
break
finally:
try:
r.close()
except AttributeError: # r is sometimes a dict for unknown reasons
pass
)
meta = xml.dom.minidom.parse(r)
except Exception as x:
print("*** Error fetching comment meta, possibly not community maintainer?")
print("***", x)
finally:
try:
r.close()
except AttributeError: # r is sometimes a dict for unknown reasons
pass

for c in meta.getElementsByTagName("comment"):
id = int(c.getAttribute("id"))
meta_comments_fetched_count += 1
metacache[id] = {
'posterid': c.getAttribute("posterid"),
'state': c.getAttribute("state"),
}
if id > new_max_comment_id:
new_max_comment_id = id

maxid = int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue)
if verbose:
print("Fetched %d metadata entries. Our max_comment_id is now %s. Highest comment_id on server is %d." % (meta_comments_fetched_count, new_max_comment_id, maxid))

for u in meta.getElementsByTagName("usermap"):
insert_or_update_user_in_map(cur, verbose, u.getAttribute("id"), u.getAttribute("user"))
if new_max_comment_id >= int(meta.getElementsByTagName("maxid")[0].firstChild.nodeValue):
break

usermap = get_users_map(cur, verbose)

@@ -271,7 +278,7 @@ def authed(params):
try:
r = urllib.request.urlopen(
urllib.request.Request(
journal_server+"/export_comments.bml?get=comment_body&startid=%d&numitems=50%s" % (commentid, authas),
journal_server+"/export_comments.bml?get=comment_body&startid=%d&numitems=%d%s" % (commentid, meta_comments_fetched_count, authas),
headers = {'Cookie': "ljsession="+ljsession}
)
)
@@ -286,6 +293,9 @@ def authed(params):
id = int(c.getAttribute("id"))
if id in comments_already_fetched:
continue
# We fetch in chunks, so may have actually fetched bodies past the metadata we've collected.
if id > new_max_comment_id:
continue
jitemid = c.getAttribute("jitemid")

db_comment = {
@@ -409,7 +419,7 @@ def authed(params):
'filename': (picfn+ext),
'url': userpics[p]})

sync_status.last_max_comment_id = new_max_comment_id
sync_status['last_max_comment_id'] = new_max_comment_id

set_sync_status(cur, sync_status)

@@ -439,8 +449,8 @@ def authed(params):
help="reduce log output")
args.add_argument("--no_html", "-n", action='store_false', dest='make_pages',
help="don't process the journal data into HTML files.")
args.add_argument("--fifty", "-f", action='store_true', dest='fifty',
help="stop after synchronizing 50 entries, and do not fetch anything else")
args.add_argument('--max', type=int, default=400, dest='max_to_fetch',
help='Maximum number of entries and comments to fetch at a time. Default is 400.')
args.add_argument("--cache_images", "-i", action='store_true', dest='cache_images',
help="build a cache of images referenced in entries")
args.add_argument("--dont_retry_images", "-d", action='store_false', dest='retry_images',
@@ -502,7 +512,7 @@ def authed(params):
ljuniq=ljuniq,
journal_short_name=journal,
verbose=args.verbose,
stop_at_fifty=args.fifty,
max_to_fetch=args.max_to_fetch,
make_pages=args.make_pages,
cache_images=args.cache_images,
retry_images=args.retry_images
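The core of the ljdump.py change above is the comment-metadata loop: instead of a fixed `numitems=50`, each request now asks for `numitems=max_to_fetch` starting just past the highest comment ID seen so far, and the loop stops once that ID reaches the server's reported `maxid`. Here is a minimal sketch of that paging pattern, assuming a hypothetical `fetch_meta()` helper that performs one `export_comments.bml?get=comment_meta` request and returns the comment IDs plus the server's `maxid`; in the real script the request and XML parsing happen inline.

```python
# Paging sketch only. fetch_meta() is hypothetical; ljdump.py builds the URL
# and parses the XML inside its own while loop.
def collect_comment_ids(fetch_meta, last_max_comment_id, max_to_fetch):
    new_max_comment_id = last_max_comment_id
    seen_ids = set()
    while True:
        # Ask for the next batch, starting just past the highest ID we already have.
        ids, server_max_id = fetch_meta(startid=new_max_comment_id + 1,
                                        numitems=max_to_fetch)
        if not ids:
            break  # nothing new came back; avoid spinning forever
        for comment_id in ids:
            seen_ids.add(comment_id)
            if comment_id > new_max_comment_id:
                new_max_comment_id = comment_id
        # Stop once we have caught up with the highest comment ID on the server.
        if new_max_comment_id >= server_max_id:
            break
    return seen_ids, new_max_comment_id
```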
10 changes: 5 additions & 5 deletions ljdumpsqlite.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# ljdumpsqlite.py - SQLite support tools for livejournal archiver
# Version 1.7.7
# Version 1.7.8
#
# LICENSE
#
@@ -265,7 +265,7 @@ def get_sync_status_or_defaults(cur, last_sync, last_max_comment_id):
else:
last_sync = row[0]
last_max_comment_id = row[1]
status = {"last_sync": last_sync , "last_max_comment_id": last_max_comment_id}
status = {"last_sync": last_sync, "last_max_comment_id": last_max_comment_id}
return status


Expand Down Expand Up @@ -566,7 +566,7 @@ def insert_or_update_comment(cur, verbose, comment):
row = cur.fetchone()
if not row:
if verbose:
print('Adding new comment by %s for entry %s with id %s' % (comment['user'], comment['entryid'], comment['id']))
print('Adding new comment by %s for entry %s with ID %s' % (comment['user'], comment['entryid'], comment['id']))
cur.execute("""
INSERT INTO comments (
id,
@@ -592,7 +592,7 @@ def insert_or_update_comment(cur, verbose, comment):
return True
else:
if verbose:
print('Updating existing comment by %s for entry %s with id %s' % (comment['user'], comment['entryid'], comment['id']))
print('Updating existing comment by %s for entry %s with ID %s' % (comment['user'], comment['entryid'], comment['id']))
cur.execute("""
UPDATE comments SET
entryid = :entryid,
@@ -930,7 +930,7 @@ def set_sync_status(cur, status):
:param cur: database cursor
:param status: sync status record
"""
cur.execute("UPDATE status SET lastsync = ?, lastmaxcommentid = ?", (status.last_sync, status.last_max_comment_id))
cur.execute("UPDATE status SET lastsync = ?, lastmaxcommentid = ?", (status['last_sync'], status['last_max_comment_id']))


def finish_with_database(conn, cur):
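The ljdumpsqlite.py fixes above switch `sync_status` from attribute access to plain dictionary keys, matching what `get_sync_status_or_defaults` actually returns. Below is a minimal sketch of that round trip, using an in-memory table with the `lastsync` and `lastmaxcommentid` columns named in `set_sync_status`; it is an illustration only, not the real schema setup.

```python
# Sketch only: an in-memory table standing in for the real archive database.
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE status (lastsync TEXT, lastmaxcommentid INTEGER)")
cur.execute("INSERT INTO status VALUES (?, ?)", ("1970-01-01 00:00:00", 0))

# The status record travels as a plain dict, keyed the way ljdump.py now reads it.
sync_status = {"last_sync": "2024-09-07 12:00:00", "last_max_comment_id": 1234}
cur.execute("UPDATE status SET lastsync = ?, lastmaxcommentid = ?",
            (sync_status["last_sync"], sync_status["last_max_comment_id"]))
conn.commit()

row = cur.execute("SELECT lastsync, lastmaxcommentid FROM status").fetchone()
print({"last_sync": row[0], "last_max_comment_id": row[1]})
```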
2 changes: 1 addition & 1 deletion ljdumptohtml.py
@@ -3,7 +3,7 @@
#
# ljdumptohtml.py - convert sqlite livejournal archive to html pages
# Garrett Birkel et al
# Version 1.7.7
# Version 1.7.8
#
# LICENSE
#
