diff --git a/ADSCitationCapture/db.py b/ADSCitationCapture/db.py index 126b1e4..c33aa41 100644 --- a/ADSCitationCapture/db.py +++ b/ADSCitationCapture/db.py @@ -101,6 +101,33 @@ def update_citation_target_metadata(app, content, raw_metadata, parsed_metadata, metadata_updated = _update_citation_target_metadata_session(session, content, raw_metadata, parsed_metadata, curated_metadata, status=status, bibcode=bibcode, associated=associated) return metadata_updated +def _update_citation_target_curator_message_session(session, content, msg): + """ + Actual calls to database session for update_citation_target_curator_message + + Stores msg in the curated_metadata of the citation target matching content. + Returns True if a matching citation target was found and updated, False otherwise. + """ + msg_updated = False + citation_target = session.query(CitationTarget).filter(CitationTarget.content == content).first() + if citation_target: + citation_target.curated_metadata = msg + session.add(citation_target) + session.commit() + msg_updated = True + return msg_updated + +def update_citation_target_curator_message(app, content, msg): + """ + Store a curator message in the curated_metadata of a citation target + + Returns True if a matching citation target was found and updated, False otherwise. + """ + msg_updated = False + with app.session_scope() as session: + msg_updated = _update_citation_target_curator_message_session(session, content, msg) + return msg_updated + def store_citation(app, citation_change, content_type, raw_metadata, parsed_metadata, status): """ Stores a new citation in the DB diff --git a/ADSCitationCapture/tasks.py b/ADSCitationCapture/tasks.py index e2d49e5..b752301 100644 --- a/ADSCitationCapture/tasks.py +++ b/ADSCitationCapture/tasks.py @@ -601,155 +601,160 @@ def task_maintenance_curation(dois, bibcodes, curated_entries, reset=False): #Try by doi. if curated_entry.get('doi'): - registered_record = db.get_citation_targets_by_doi(app, [curated_entry.get('doi')], only_status='REGISTERED')[0] + registered_records = db.get_citation_targets_by_doi(app, [curated_entry.get('doi')], only_status='REGISTERED') #If not, retrieve entry by bibcode. 
elif curated_entry.get('bibcode'): - registered_record = db.get_citation_targets_by_bibcode(app, [curated_entry.get('bibcode')], only_status='REGISTERED')[0] + registered_records = db.get_citation_targets_by_bibcode(app, [curated_entry.get('bibcode')], only_status='REGISTERED') #report error else: logger.error('Unable to retrieve entry for {} from database. Please check input file.'.format(curated_entry)) - metadata = db.get_citation_target_metadata(app, registered_record.get('content', ''), curate=False) - raw_metadata = metadata.get('raw', '') - parsed_metadata = metadata.get('parsed', '') - #remove doi and bibcode from metadata to be stored in db. - for key in ['bibcode','doi']: + if registered_records: + registered_record = registered_records[0] + metadata = db.get_citation_target_metadata(app, registered_record.get('content', ''), curate=False) + raw_metadata = metadata.get('raw', '') + parsed_metadata = metadata.get('parsed', '') + #remove doi and bibcode from metadata to be stored in db. + for key in ['bibcode','doi']: + try: + curated_entry.pop(key) + except KeyError as e: + logger.warn("Failed to remove key: {} with error {}. Key likely not in curated_metadata.".format(key, e)) + continue try: - curated_entry.pop(key) - except KeyError as e: - logger.warn("Failed to remove key: {} with error {}. Key likely not in curated_metadata.".format(key, e)) - continue - try: - if not reset: - if 'authors' in curated_entry.keys(): - #checks to make sure authors are in a list. Errors out if not. - if isinstance(curated_entry.get('authors', []), list): - curated_entry['normalized_authors'] = doi.renormalize_author_names(curated_entry.get('authors', None)) - else: - logger.error("'author' key is not a list of authors. Stopping.") - err = "'authors' is not a valid list of strings" - raise TypeError(err) - #only check old metadata if we are adding updates, otherwise ignore. 
- if curated_entry != registered_record.get('curated_metadata'): - for key in registered_record['curated_metadata'].keys(): - #first apply any previous edits to metadata that are not overwritten by new metadata. - if key != "error" and key not in curated_entry.keys(): - curated_entry[key] = registered_record['curated_metadata'][key] - else: - logger.warn("Supplied metadata is identical to previously added metadata. No updates will occur.") - logger.debug("Curated entry: {}".format(curated_entry)) - modified_metadata = db.generate_modified_metadata(parsed_metadata, curated_entry) - logger.debug("Modified bibcode {}".format(modified_metadata.get('bibcode'))) - #regenerate bibcode with curated_metadata and append old bibcode to alternate_bibcode - zenodo_bibstem = "zndo" - #generates new bibcodes with manual curation data - new_bibcode = doi.build_bibcode(modified_metadata, doi.zenodo_doi_re, zenodo_bibstem) - modified_metadata['bibcode'] = new_bibcode - #get the original list of alt bibcodes - alternate_bibcode = registered_record.get('alternate_bibcode', []) - #set parsed_metadata alt bibcodes to match original list - parsed_metadata['alternate_bibcode'] = registered_record.get('alternate_bibcode', []) - #checks for provided alt bibcodes from manual curation - if 'alternate_bibcode' in curated_entry.keys(): - #checks to make sure alternate_bibcodes are in a list. Errors out if not. - if isinstance(curated_entry.get('alternate_bibcode', []), list): - alternate_bibcode = list(set(alternate_bibcode+curated_entry['alternate_bibcode'])) - logger.debug('alternate bibcodes are {}'.format(alternate_bibcode)) + if not reset: + if 'authors' in curated_entry.keys(): + #checks to make sure authors are in a list. Errors out if not. + if isinstance(curated_entry.get('authors', []), list): + curated_entry['normalized_authors'] = doi.renormalize_author_names(curated_entry.get('authors', None)) + else: + logger.error("'author' key is not a list of authors. 
Stopping.") + err = "'authors' is not a valid list of strings" + raise TypeError(err) + #only check old metadata if we are adding updates, otherwise ignore. + if curated_entry != registered_record.get('curated_metadata'): + for key in registered_record['curated_metadata'].keys(): + #first apply any previous edits to metadata that are not overwritten by new metadata. + if key != "error" and key not in curated_entry.keys(): + curated_entry[key] = registered_record['curated_metadata'][key] else: - logger.error("'alternate_bibcodes' key is not a list of alternate_bibcodes. Stopping.") - err = "'alternate_bibcodes' is not a valid list of bibcodes" - raise TypeError(err) - - #checks to make sure the main bibcode is not in the alt bibcodes - try: - alternate_bibcode.remove(modified_metadata.get('bibcode')) - except: - pass - #checks if bibcode has changed due to manual curation metadata - if new_bibcode != registered_record.get('bibcode'): - logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. 
The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) - if registered_record.get('bibcode') not in alternate_bibcode: - #generate complete alt bibcode list including any curated entries - alternate_bibcode.append(registered_record.get('bibcode')) - #Add the CC generated bibcode to the parsed metadata - parsed_metadata['alternate_bibcode'].append(registered_record.get('bibcode')) - #removes duplicates from parsed_metadata alt bibcodes - parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) - #sets new bibcode - modified_metadata['bibcode'] = new_bibcode - #removes duplicates from all alt bibcodes including ones provided by manual curation - alternate_bibcode = list(set(alternate_bibcode)) - #updates curated entry alt bibcodes only if a new bibcode is generated due to manual curation - curated_entry['alternate_bibcode'] = alternate_bibcode - #marks bibcode as replaced - bibcode_replaced = {'previous': registered_record['bibcode'], 'new': new_bibcode} - #sets modified metadata alt bibcodes to match the full list of alt bibcodes. - modified_metadata['alternate_bibcode'] = alternate_bibcode - - else: - #Check to see if curated_metadata exists for the record. - if registered_record['curated_metadata']: - #Repopulate parsed_metadata with expected bibcode information from parsed_cited_metadata. - logger.debug("Resetting citation to original parsed metadata") - #regenerate bibcode with parsed_metadata and append old bibcode to alternate_bibcode + logger.warn("Supplied metadata is identical to previously added metadata. 
No updates will occur.") + logger.debug("Curated entry: {}".format(curated_entry)) + modified_metadata = db.generate_modified_metadata(parsed_metadata, curated_entry) + logger.debug("Modified bibcode {}".format(modified_metadata.get('bibcode'))) + #regenerate bibcode with curated_metadata and append old bibcode to alternate_bibcode zenodo_bibstem = "zndo" - new_bibcode = doi.build_bibcode(parsed_metadata, doi.zenodo_doi_re, zenodo_bibstem) - parsed_metadata['bibcode'] = new_bibcode - #get original alt bibcodes + #generates new bibcodes with manual curation data + new_bibcode = doi.build_bibcode(modified_metadata, doi.zenodo_doi_re, zenodo_bibstem) + modified_metadata['bibcode'] = new_bibcode + #get the original list of alt bibcodes alternate_bibcode = registered_record.get('alternate_bibcode', []) + #set parsed_metadata alt bibcodes to match original list parsed_metadata['alternate_bibcode'] = registered_record.get('alternate_bibcode', []) - #reset bibcode if changed + #checks for provided alt bibcodes from manual curation + if 'alternate_bibcode' in curated_entry.keys(): + #checks to make sure alternate_bibcodes are in a list. Errors out if not. + if isinstance(curated_entry.get('alternate_bibcode', []), list): + alternate_bibcode = list(set(alternate_bibcode+curated_entry['alternate_bibcode'])) + logger.debug('alternate bibcodes are {}'.format(alternate_bibcode)) + else: + logger.error("'alternate_bibcodes' key is not a list of alternate_bibcodes. Stopping.") + err = "'alternate_bibcodes' is not a valid list of bibcodes" + raise TypeError(err) + + #checks to make sure the main bibcode is not in the alt bibcodes + try: + alternate_bibcode.remove(modified_metadata.get('bibcode')) + except: + pass + #checks if bibcode has changed due to manual curation metadata if new_bibcode != registered_record.get('bibcode'): logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. 
The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) - #Add old bibcode to alt bibcodes if registered_record.get('bibcode') not in alternate_bibcode: + #generate complete alt bibcode list including any curated entries alternate_bibcode.append(registered_record.get('bibcode')) - #set bibcode replaced if necessary - bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } - #set alt bibcodes to full list but try and remove main bibcode from alt list - try: - alternate_bibcode.remove(parsed_metadata.get('bibcode')) - except: - pass - parsed_metadata['alternate_bibcode'] = list(set(alternate_bibcode)) - #reset modified metadata - modified_metadata = parsed_metadata - #clear curated metadata - curated_entry = {} + #Add the CC generated bibcode to the parsed metadata + parsed_metadata['alternate_bibcode'].append(registered_record.get('bibcode')) + #removes duplicates from parsed_metadata alt bibcodes + parsed_metadata['alternate_bibcode'] = list(set(parsed_metadata.get('alternate_bibcode'))) + #sets new bibcode + modified_metadata['bibcode'] = new_bibcode + #removes duplicates from all alt bibcodes including ones provided by manual curation + alternate_bibcode = list(set(alternate_bibcode)) + #updates curated entry alt bibcodes only if a new bibcode is generated due to manual curation + curated_entry['alternate_bibcode'] = alternate_bibcode + #marks bibcode as replaced + bibcode_replaced = {'previous': registered_record['bibcode'], 'new': new_bibcode} + #sets modified metadata alt bibcodes to match the full list of alt bibcodes. + modified_metadata['alternate_bibcode'] = alternate_bibcode + else: - modified_metadata = parsed_metadata - logger.warn("Cannot delete curated metadata for {}. 
No curated metadata exists.".format(registered_record.get('content', ''))) - - different_bibcodes = registered_record['bibcode'] != modified_metadata['bibcode'] - if different_bibcodes: - event_data = webhook.identical_bibcodes_event_data(registered_record['bibcode'], modified_metadata['bibcode']) - if event_data: - dump_prefix = datetime.now().strftime("%Y%m%d") # "%Y%m%d_%H%M%S" - logger.debug("Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'", registered_record['bibcode'], modified_metadata['bibcode']) - task_emit_event.delay(event_data, dump_prefix) + #Check to see if curated_metadata exists for the record. + if registered_record['curated_metadata']: + #Repopulate parsed_metadata with expected bibcode information from parsed_cited_metadata. + logger.debug("Resetting citation to original parsed metadata") + #regenerate bibcode with parsed_metadata and append old bibcode to alternate_bibcode + zenodo_bibstem = "zndo" + new_bibcode = doi.build_bibcode(parsed_metadata, doi.zenodo_doi_re, zenodo_bibstem) + parsed_metadata['bibcode'] = new_bibcode + #get original alt bibcodes + alternate_bibcode = registered_record.get('alternate_bibcode', []) + parsed_metadata['alternate_bibcode'] = registered_record.get('alternate_bibcode', []) + #reset bibcode if changed + if new_bibcode != registered_record.get('bibcode'): + logger.warn("Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. 
The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.", registered_record['bibcode'],new_bibcode) + #Add old bibcode to alt bibcodes + if registered_record.get('bibcode') not in alternate_bibcode: + alternate_bibcode.append(registered_record.get('bibcode')) + #set bibcode replaced if necessary + bibcode_replaced = {'previous': registered_record['bibcode'], 'new': parsed_metadata['bibcode'] } + #set alt bibcodes to full list but try and remove canonical bibcode from alt list + try: + alternate_bibcode.remove(parsed_metadata.get('bibcode')) + except: + #we pass because this just means the canonical bibcode is not in the list of alt bibcodes + pass + parsed_metadata['alternate_bibcode'] = list(set(alternate_bibcode)) + #reset modified metadata + modified_metadata = parsed_metadata + #clear curated metadata + curated_entry = {} + else: + modified_metadata = parsed_metadata + logger.warn("Cannot delete curated metadata for {}. No curated metadata exists.".format(registered_record.get('content', ''))) - updated = db.update_citation_target_metadata(app, registered_record['content'], raw_metadata, parsed_metadata, curated_metadata=curated_entry, bibcode=modified_metadata.get('bibcode'), associated=registered_record.get('associated_works', {"":""})) - if updated: - citation_change = adsmsg.CitationChange(content=registered_record['content'], - content_type=getattr(adsmsg.CitationChangeContentType, registered_record['content_type'].lower()), - status=adsmsg.Status.updated, - timestamp=datetime.now() - ) - if citation_change.content_type == adsmsg.CitationChangeContentType.doi: - # Get citations from the database and transform the stored bibcodes into their canonical ones as registered in Solr. 
- original_citations = db.get_citations_by_bibcode(app, registered_record['bibcode']) - citations = api.get_canonical_bibcodes(app, original_citations) - logger.debug("Calling 'task_output_results' with '%s'", citation_change) - task_output_results.delay(citation_change, modified_metadata, citations, bibcode_replaced=bibcode_replaced, db_versions=registered_record.get('associated_works', {"":""})) - else: - logger.warn("Curated metadata did not result in a change to recorded metadata for {}.".format(registered_record.get('content'))) - except Exception as e: - err = "task_maintenance_curation Failed to update metadata for {} with Exception: {}. Please check the input data and try again.".format(curated_entry, e) - err_dict = registered_record.get('curated_metadata', {}) - err_dict['error'] = err - db.update_citation_target_curator_message(app, registered_record['content'], err_dict) - logger.exception(err) - raise + different_bibcodes = registered_record['bibcode'] != modified_metadata['bibcode'] + if different_bibcodes: + event_data = webhook.identical_bibcodes_event_data(registered_record['bibcode'], modified_metadata['bibcode']) + if event_data: + dump_prefix = datetime.now().strftime("%Y%m%d") # "%Y%m%d_%H%M%S" + logger.debug("Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'", registered_record['bibcode'], modified_metadata['bibcode']) + task_emit_event.delay(event_data, dump_prefix) + + updated = db.update_citation_target_metadata(app, registered_record['content'], raw_metadata, parsed_metadata, curated_metadata=curated_entry, bibcode=modified_metadata.get('bibcode'), associated=registered_record.get('associated_works', {"":""})) + if updated: + citation_change = adsmsg.CitationChange(content=registered_record['content'], + content_type=getattr(adsmsg.CitationChangeContentType, registered_record['content_type'].lower()), + status=adsmsg.Status.updated, + timestamp=datetime.now() + ) + if citation_change.content_type == 
adsmsg.CitationChangeContentType.doi: + # Get citations from the database and transform the stored bibcodes into their canonical ones as registered in Solr. + original_citations = db.get_citations_by_bibcode(app, registered_record['bibcode']) + citations = api.get_canonical_bibcodes(app, original_citations) + logger.debug("Calling 'task_output_results' with '%s'", citation_change) + task_output_results.delay(citation_change, modified_metadata, citations, bibcode_replaced=bibcode_replaced, db_versions=registered_record.get('associated_works', {"":""})) + else: + logger.warn("Curated metadata did not result in a change to recorded metadata for {}.".format(registered_record.get('content'))) + except Exception as e: + err = "task_maintenance_curation Failed to update metadata for {} with Exception: {}. Please check the input data and try again.".format(curated_entry, e) + err_dict = registered_record.get('curated_metadata', {}) + err_dict['error'] = err + db.update_citation_target_curator_message(app, registered_record['content'], err_dict) + logger.exception(err) + raise + else: + logger.error('Unable to retrieve entry for {} from database. Please check input file.'.format(curated_entry)) def maintenance_show_metadata(curated_entries): """ @@ -760,9 +765,9 @@ def maintenance_show_metadata(curated_entries): if curated_entry.get('doi'): try: registered_record = db.get_citation_targets_by_doi(app, [curated_entry.get('doi')], only_status='REGISTERED')[0] - except Exception as e: + except Exception: msg = "Failed to retrieve citation target {}. 
Please confirm information is correct and citation target is in database.".format(curated_entry) - logger.error(msg) + logger.exception(msg) raise Exception(msg) custom_citation_change = adsmsg.CitationChange(content=registered_record['content'], @@ -779,17 +784,17 @@ def maintenance_show_metadata(curated_entries): if "error" in curated.keys(): print("\n The most recent attempt to curate metadata failed with the following error: {}".format(curated.get("error", ""))) - except Exception as e: + except Exception: msg = "Failed to load metadata for citation {}. Please confirm information is correct and citation target is in database.".format(curated_entry) - logger.error(msg) + logger.exception(msg) #If no doi, try and retrieve entry by bibcode. elif curated_entry.get('bibcode'): try: registered_record = db.get_citation_targets_by_bibcode(app, [curated_entry.get('bibcode')], only_status='REGISTERED')[0] - except Exception as e: + except Exception: msg = "Failed to retrieve citation target {}. Please confirm information is correct and citation target is in database.".format(curated_entry) - logger.error(msg) + logger.exception(msg) raise Exception(msg) custom_citation_change = adsmsg.CitationChange(content=registered_record['content'], @@ -806,9 +811,9 @@ def maintenance_show_metadata(curated_entries): if "error" in curated.keys(): print("\n The most recent attempt to curate metadata failed with the following error: {}".format(curated.get("error", ""))) - except Exception as e: + except Exception: msg = "Failed to load metadata for citation {}. 
Please confirm information is correct and citation target is in database.".format(curated_entry) - logger.error(msg) + logger.exception(msg) @app.task(queue='maintenance_metadata') def task_maintenance_repopulate_bibcode_columns(): diff --git a/README.md b/README.md index ae6a18a..7ef9d23 100644 --- a/README.md +++ b/README.md @@ -533,7 +533,7 @@ python3 run.py MAINTENANCE --curation --bibcode "YYYYzndo...BCDEFGR" --show If the `"authors"` key is specified, CitationCapture will recalculate the `"normalized_authors"` field automatically. -**NOTE: Any attribute that has a list specified as the value must be given the entire list including any unedited entries ie. If you edit a single author name, the entire author list must be included in the `curated_metadata`.** +**NOTE: Any attribute that has a list specified as the value must be given the entire list, including any unedited entries, i.e. if you edit a single author name, the entire author list must be included in the `curated_metadata`. The lone exception is `alternate_bibcode`.** For clearing `curated_metadata` by input file, only the `doi` or `bibcode` needs to be specified in the file. Any other details entered into the entry will be ignored. @@ -541,7 +541,9 @@ Alternate bibcodes are handled in a slightly different manner. Any bibcode that **NOTE: the `json` keys must be contained in `" "` not `' '` or else the entire process will error out. `--show` now returns the proper format by default.** -By default. `--show` displays a the metadata as a single line. This is the required format for any metadata updates specified in `--input_filename` or `--json`. To make the text more readable you can pipe the output into `jq` +If an error occurs during curation, the error will be saved into the `curated_metadata` field. Any previous curated metadata will be retained and `--show` will return the current metadata as well as the error message on a separate line. + +By default, 
`--show` displays the metadata as a single line. This is the required format for any metadata updates specified in `--input_filename` or `--json`. To make the text more readable, you can pipe the output into `jq`: ``` python3 run.py MAINTENANCE --curation --doi 10.5281/zenodo.123567 --show | jq .