made the cli xml output and the dump xml output the same

nulib · Apr 2, 2024 · 37fcebc · 37fcebc
1 parent 8182271
commit 37fcebc
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 13 deletions.
diff --git a/nuldc/commandline.py b/nuldc/commandline.py
@@ -63,7 +63,7 @@ def main():
         data = {"message": "saved csv to :" + args['<outfile>']}
 
     if args["xml"]:
-        # saving xml
+        # saving xml 
         helpers.save_xml(data, args['<outfile>'])
         data = {"message": "saved xml to :" + args['<outfile>']}
 

diff --git a/nuldc/dump.py b/nuldc/dump.py
@@ -52,7 +52,7 @@ def save_files(basename, data):
     with open(f"json/{basename}.json", 'w', encoding='utf-8') as f:
         json.dump(data.get('data'), f)
 
-    helpers.save_xml(data.get('data'), f'xml/{basename}.xml')
+    helpers.save_xml(data, f'xml/{basename}.xml')
 
     headers, values = helpers.sort_fields_and_values(data)
     helpers.save_as_csv(headers, values, f'csv/{basename}.csv')
@@ -63,13 +63,15 @@ def dump_collection(col_id):
     json, xml, and csv files"""
 
     params = {
-        "query": f"collection.id:{col_id}",
+        "query": f"collection.id: {col_id}",
         "size": "50",
         "sort": "id:asc"}
     try:
         data = helpers.get_search_results(API,
-                                      "works",
-                                          params, all_results=True, page_limit=5000)
+                                          "works",
+                                          params,
+                                          all_results=True,
+                                          page_limit=5000)
         col_title = data['data'][0]['collection']['title']
         filename = f"{slugify(col_title)}-{col_id}"
         save_files(filename, data)
@@ -106,7 +108,7 @@ def main():
         with open('_updated_at.txt') as f:
             updated = f.readline().strip()
 
-        query = f"indexed_at:>={updated}"
+        query = f"indexed_at: >={updated}"
         print(f"looking for collections with works updated since {query}")
     else:
         print("can't find updated since file, rebuilding all collections")

diff --git a/nuldc/helpers.py b/nuldc/helpers.py
@@ -174,19 +174,20 @@ def save_as_csv(headers, values, output_file):
             writer.writerow(row)
 
 
-def save_xml(data, output_file):
+def save_xml(opensearch_results, output_file):
     """takes results as a list of dicts and writes them out to xml"""
-
 
     # TODO DRY up this bit and sort_fields_and_values
 
     ignore_fields = ['embedding', 'embedding_model']
-
+    # massage the data and remove the embeddings
+    data = opensearch_results.get('data')
     data = [{key: value
              for (key, value) in sorted(d.items())
-             if key not in ignore_fields} for d in data.get('data')]
-    xml = dicttoxml.dicttoxml(data, attr_type=False)
-
+             if key not in ignore_fields} for d in data]
+    opensearch_results['data'] = data
+    xml = dicttoxml.dicttoxml(opensearch_results, attr_type=False)
+
     with open(output_file, 'wb') as xmlfile:
         xmlfile.write(xml)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "nuldc"
-version = "0.10.1"
+version = "0.10.2"
 description = ""
 authors = ["davidschober <davidschob@gmail.com>"]
 license = "MIT"