- Attempt to fix music field parsing for some entries

- Fix for crash on missing security properties for some entries
- Image fetch timeout reduced from 5 seconds to 4 seconds
GBirkel · Jun 27, 2024 · 98430f1 · 98430f1
1 parent 5c8aba2
commit 98430f1
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 13 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,5 +1,11 @@
 ChangeLog - ljdump
 
+Version 1.7.6 - 2024-06-26
+
+- Attempt to fix music field parsing for some entries
+- Fix for crash on missing security properties for some entries
+- Image fetch timeout reduced from 5 seconds to 4 seconds
+
 Version 1.7.5 - 2024-05-20
 
 - Fixed time conversion error on Windows machines.

diff --git a/ljdump.py b/ljdump.py
@@ -3,7 +3,7 @@
 #
 # ljdump.py - livejournal archiver
 # Greg Hewgill, Garrett Birkel, et al
-# Version 1.7.5
+# Version 1.7.6
 #
 # LICENSE
 #
@@ -23,7 +23,7 @@
 #    misrepresented as being the original software.
 # 3. This notice may not be removed or altered from any source distribution.
 #
-# Copyright (c) 2005-2010 Greg Hewgill and contributors
+# Copyright (c) 2005-2024 Greg Hewgill and contributors
 
 import argparse, codecs, os, pickle, pprint, re, shutil, sys, xml.dom.minidom
 import xmlrpc.client
@@ -164,7 +164,7 @@ def authed(params):
 
                     # Process the event
 
-                    # Wanna do a bulk replace of something in your entire journal? This is now.
+                    # Wanna do a bulk replace of something in your entire journal? This is how.
                     #ev['event'] = re.sub('http://(edu.|staff.|)mmcs.sfedu.ru/~ulysses',
                     #                     'https://a-pelenitsyn.github.io/Files',
                     #                     str(ev['event']))
@@ -341,13 +341,26 @@ def authed(params):
     }))
 
     for t in r['tags']:
+
+        ts_private = '0'
+        ts_protected = '0'
+        ts_public = '0'
+        ts_level = '0'
+
+        if 'security' in t:
+            s = t['security']
+            if 'private' in s: ts_private = s['private']
+            if 'protected' in s: ts_protected = s['protected']
+            if 'public' in s: ts_public = s['public']
+            if 'level' in s: ts_level = s['level']
+
         insert_or_update_tag(cur, verbose,
             {   'name': t['name'],
                 'display': t['display'],
-                'security_private': t['security']['private'],
-                'security_protected': t['security']['protected'],
-                'security_public': t['security']['public'],
-                'security_level': t['display'],
+                'security_private': ts_private,
+                'security_protected': ts_protected,
+                'security_public': ts_public,
+                'security_level': ts_level,
                 'uses': t['uses']})
 
     #

diff --git a/ljdumpsqlite.py b/ljdumpsqlite.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 #
 # ljdumpsqlite.py - SQLite support tools for livejournal archiver
-# Version 1.7.5
+# Version 1.7.6
 #
 # LICENSE
 #
@@ -398,7 +398,7 @@ def insert_or_update_event(cur, verbose, ev):
 
         "props_commentalter": ev['props'].get("commentalter", None),
         "props_current_moodid": ev['props'].get("current_moodid", None),
-        "props_current_music": ev['props'].get("current_music", None),
+        "props_current_music": str(ev['props'].get("current_music", None)),
         "props_import_source": ev['props'].get("import_source", None),
         "props_interface": ev['props'].get("interface", None),
         "props_opt_backdated": ev['props'].get("opt_backdated", None),

diff --git a/ljdumptohtml.py b/ljdumptohtml.py
@@ -3,7 +3,7 @@
 #
 # ljdumptohtml.py - convert sqlite livejournal archive to html pages 
 # Garrett Birkel et al
-# Version 1.7.5
+# Version 1.7.6
 #
 # LICENSE
 #
@@ -464,14 +464,14 @@ def create_single_entry_page(journal_short_name, entry, comments, image_urls_to_
         bottomnav_a.text = u"Next Entry"
 
     # We're going to be weird here, because journal entries often contain weird and
-    # broken HTML.  We really can't rely on parsing a journal entry into XML and then
+    # broken HTML.  We can't rely on parsing a journal entry into XML and then
     # embedding it as elements.  There is also no clean way to slipstream string data
     # into the XML during the rendering process (it either gets parsed as usual before
     # insertion, or run through an escaper).  So we're going to render the document as
     # text right here, and then do a text search (a split) to find the div with id
     # "entry-content-insertion-point".  Then we'll interleave the entry contents and
     # re-assemble the document.  It's hacky but it avoids the need to police the HTML
-    # skills of thousands of users whose entires render fine in Dreamwidth.
+    # skills of thousands of users whose entries render fine in Dreamwidth.
     html_as_string = ET.tostring(page, encoding="utf-8", method="html").decode('utf-8')
     html_split_on_entry_body = html_as_string.split(u'<div class="entry-content" id="entry-content-insertion-point"></div>')
 
@@ -642,7 +642,7 @@ def download_entry_image(img_url, journal_short_name, subfolder, image_id, entry
             # Only necessary for Dreamwidth-hosted images, but does no harm generally.
             headers = {'Referer': entry_url, 'Cookie': "ljuniq="+ljuniq}
 
-        image_req = urllib.request.urlopen(urllib.request.Request(img_url, headers = headers), timeout = 5)
+        image_req = urllib.request.urlopen(urllib.request.Request(img_url, headers = headers), timeout = 4)
         if image_req.headers.get_content_maintype() != 'image':
             print('Content type %s not expected, image skipped: %s' % (image_req.headers.get_content_maintype(), img_url))
             return (1, None)