Skip to content

Commit

Permalink
Implement configurable getDocumentID in DeletionBolt (#1135)
Browse files Browse the repository at this point in the history
We already have a configurable document ID generator in IndexerBolt and
StatusUpdaterBolt; DeletionBolt should offer the same feature.

Signed-off-by: Chun-Han Hsiao <chhsiao90@diffbot.com>
Co-authored-by: Chun-Han Hsiao <chhsiao90@diffbot.com>
  • Loading branch information
chhsiao90 and Chun-Han Hsiao authored Dec 12, 2023
1 parent 7f70a47 commit 31a4b2a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ public void execute(Tuple tuple) {

// keep it simple for now and ignore cases where the canonical URL was
// used
String sha256hex = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
DeleteRequest dr = new DeleteRequest(getIndexName(metadata), sha256hex);
String docID = getDocumentID(metadata, url);
DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID);
try {
client.delete(dr, RequestOptions.DEFAULT);
} catch (IOException e) {
Expand All @@ -80,6 +80,17 @@ public void execute(Tuple tuple) {
_collector.ack(tuple);
}

/**
 * Computes the identifier of the document targeted by the delete request.
 *
 * <p>This implementation does not read {@code metadata}; the identifier is derived from
 * {@code url} alone. The method is {@code protected}, so a subclass can supply a different
 * scheme.
 *
 * @param metadata the {@link Metadata} passed alongside the URL; unused here
 * @param url the normalised URL to derive the identifier from
 * @return the SHA-256 digest of {@code url} as a hexadecimal string
 */
protected String getDocumentID(Metadata metadata, String url) {
    final String urlDigest = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
    return urlDigest;
}

@Override
public void declareOutputFields(OutputFieldsDeclarer arg0) {
// none
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public void execute(Tuple tuple) {
// keep it simple for now and ignore cases where the canonical URL was
// used

final String docID = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
final String docID = getDocumentID(metadata, url);
DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID);
connection.addToProcessor(dr);

Expand Down Expand Up @@ -161,6 +161,17 @@ protected String getIndexName(Metadata m) {
return indexName;
}

/**
 * Builds the document id used when issuing a delete request for a URL.
 *
 * <p>The {@code metadata} argument is accepted but not consulted by this implementation;
 * only {@code url} contributes to the result. Subclasses may override this method to
 * generate ids differently.
 *
 * @param metadata the {@link Metadata} supplied with the URL; ignored by this implementation
 * @param url the normalised URL whose digest becomes the document id
 * @return the hex-encoded SHA-256 digest of {@code url}
 */
protected String getDocumentID(Metadata metadata, String url) {
    final String hashedUrl = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
    return hashedUrl;
}

/**
 * Empty implementation: the body performs no work.
 *
 * <p>NOTE(review): presumably a bulk-listener callback invoked ahead of each bulk request;
 * confirm against the interface this class implements.
 *
 * @param executionId unused by this implementation
 * @param request unused by this implementation
 */
@Override
public void beforeBulk(long executionId, BulkRequest request) {}

Expand Down

0 comments on commit 31a4b2a

Please sign in to comment.