Skip to content

Commit

Permalink
- Attempt to fix music field parsing for some entries
Browse files (browse the repository at this point in the history)
- Fix for crash on missing security properties for some entries
- Image fetch timeout reduced from 5 seconds to 4 seconds
  • Loading branch information
GBirkel committed Jun 27, 2024
1 parent 5c8aba2 commit 98430f1
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 13 deletions.
6 changes: 6 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
ChangeLog - ljdump

Version 1.7.6 - 2024-06-26

- Attempt to fix music field parsing for some entries
- Fix for crash on missing security properties for some entries
- Image fetch timeout reduced from 5 seconds to 4 seconds

Version 1.7.5 - 2024-05-20

- Fixed time conversion error on Windows machines.
Expand Down
27 changes: 20 additions & 7 deletions ljdump.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#
# ljdump.py - livejournal archiver
# Greg Hewgill, Garrett Birkel, et al
# Version 1.7.5
# Version 1.7.6
#
# LICENSE
#
Expand All @@ -23,7 +23,7 @@
# misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#
# Copyright (c) 2005-2010 Greg Hewgill and contributors
# Copyright (c) 2005-2024 Greg Hewgill and contributors

import argparse, codecs, os, pickle, pprint, re, shutil, sys, xml.dom.minidom
import xmlrpc.client
Expand Down Expand Up @@ -164,7 +164,7 @@ def authed(params):

# Process the event

# Wanna do a bulk replace of something in your entire journal? This is now.
# Wanna do a bulk replace of something in your entire journal? This is how.
#ev['event'] = re.sub('http://(edu.|staff.|)mmcs.sfedu.ru/~ulysses',
# 'https://a-pelenitsyn.github.io/Files',
# str(ev['event']))
Expand Down Expand Up @@ -341,13 +341,26 @@ def authed(params):
}))

for t in r['tags']:

ts_private = '0'
ts_protected = '0'
ts_public = '0'
ts_level = '0'

if 'security' in t:
s = t['security']
if 'private' in s: ts_private = s['private']
if 'protected' in s: ts_protected = s['protected']
if 'public' in s: ts_public = s['public']
if 'level' in s: ts_level = s['level']

insert_or_update_tag(cur, verbose,
{ 'name': t['name'],
'display': t['display'],
'security_private': t['security']['private'],
'security_protected': t['security']['protected'],
'security_public': t['security']['public'],
'security_level': t['display'],
'security_private': ts_private,
'security_protected': ts_protected,
'security_public': ts_public,
'security_level': ts_level,
'uses': t['uses']})

#
Expand Down
4 changes: 2 additions & 2 deletions ljdumpsqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
#
# ljdumpsqlite.py - SQLite support tools for livejournal archiver
# Version 1.7.5
# Version 1.7.6
#
# LICENSE
#
Expand Down Expand Up @@ -398,7 +398,7 @@ def insert_or_update_event(cur, verbose, ev):

"props_commentalter": ev['props'].get("commentalter", None),
"props_current_moodid": ev['props'].get("current_moodid", None),
"props_current_music": ev['props'].get("current_music", None),
"props_current_music": str(ev['props'].get("current_music", None)),
"props_import_source": ev['props'].get("import_source", None),
"props_interface": ev['props'].get("interface", None),
"props_opt_backdated": ev['props'].get("opt_backdated", None),
Expand Down
8 changes: 4 additions & 4 deletions ljdumptohtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#
# ljdumptohtml.py - convert sqlite livejournal archive to html pages
# Garrett Birkel et al
# Version 1.7.5
# Version 1.7.6
#
# LICENSE
#
Expand Down Expand Up @@ -464,14 +464,14 @@ def create_single_entry_page(journal_short_name, entry, comments, image_urls_to_
bottomnav_a.text = u"Next Entry"

# We're going to be weird here, because journal entries often contain weird and
# broken HTML. We really can't rely on parsing a journal entry into XML and then
# broken HTML. We can't rely on parsing a journal entry into XML and then
# embedding it as elements. There is also no clean way to slipstream string data
# into the XML during the rendering process (it either gets parsed as usual before
# insertion, or run through an escaper). So we're going to render the document as
# text right here, and then do a text search (a split) to find the div with id
# "entry-content-insertion-point". Then we'll interleave the entry contents and
# re-assemble the document. It's hacky but it avoids the need to police the HTML
# skills of thousands of users whose entires render fine in Dreamwidth.
# skills of thousands of users whose entries render fine in Dreamwidth.
html_as_string = ET.tostring(page, encoding="utf-8", method="html").decode('utf-8')
html_split_on_entry_body = html_as_string.split(u'<div class="entry-content" id="entry-content-insertion-point"></div>')

Expand Down Expand Up @@ -642,7 +642,7 @@ def download_entry_image(img_url, journal_short_name, subfolder, image_id, entry
# Only necessary for Dreamwidth-hosted images, but does no harm generally.
headers = {'Referer': entry_url, 'Cookie': "ljuniq="+ljuniq}

image_req = urllib.request.urlopen(urllib.request.Request(img_url, headers = headers), timeout = 5)
image_req = urllib.request.urlopen(urllib.request.Request(img_url, headers = headers), timeout = 4)
if image_req.headers.get_content_maintype() != 'image':
print('Content type %s not expected, image skipped: %s' % (image_req.headers.get_content_maintype(), img_url))
return (1, None)
Expand Down

0 comments on commit 98430f1

Please sign in to comment.