Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement configurable getDocumentID in DeletionBolt #1135

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ public void execute(Tuple tuple) {

// keep it simple for now and ignore cases where the canonical URL was
// used
String sha256hex = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
DeleteRequest dr = new DeleteRequest(getIndexName(metadata), sha256hex);
String docID = getDocumentID(metadata, url);
DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID);
try {
client.delete(dr, RequestOptions.DEFAULT);
} catch (IOException e) {
Expand All @@ -80,6 +80,17 @@ public void execute(Tuple tuple) {
_collector.ack(tuple);
}

/**
 * Computes the identifier of the document to delete for the given URL.
 *
 * <p>This default implementation hashes the URL with SHA-256 and returns the digest as a
 * hexadecimal string; the metadata argument is not consulted here. The method is protected,
 * so a subclass can override it to derive the identifier differently.
 *
 * @param metadata The {@link Metadata} passed along with the URL; unused by this implementation.
 * @param url The normalised url.
 * @return The SHA-256 digest of the normalised url, as a hexadecimal String.
 */
protected String getDocumentID(Metadata metadata, String url) {
    final String urlDigest = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
    return urlDigest;
}

@Override
public void declareOutputFields(OutputFieldsDeclarer arg0) {
// none
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public void execute(Tuple tuple) {
// keep it simple for now and ignore cases where the canonical URL was
// used

final String docID = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
final String docID = getDocumentID(metadata, url);
DeleteRequest dr = new DeleteRequest(getIndexName(metadata), docID);
connection.addToProcessor(dr);

Expand Down Expand Up @@ -161,6 +161,17 @@ protected String getIndexName(Metadata m) {
return indexName;
}

/**
 * Derives the document id used when building the delete request for a URL.
 *
 * <p>By default the id is the SHA-256 digest of the URL rendered as a hexadecimal string.
 * The metadata parameter is ignored by this implementation; because the method is protected,
 * subclasses may override it to produce a different id.
 *
 * @param metadata The {@link Metadata} passed along with the URL; not read by this implementation.
 * @param url The normalised url.
 * @return The hexadecimal SHA-256 digest of the normalised url.
 */
protected String getDocumentID(Metadata metadata, String url) {
    final String hashedUrl = org.apache.commons.codec.digest.DigestUtils.sha256Hex(url);
    return hashedUrl;
}

/**
 * No-op override: the body is intentionally empty, so nothing is done in this hook.
 *
 * <p>NOTE(review): presumably a bulk-processor listener callback that runs before a bulk
 * request is sent — confirm against the interface this class implements.
 *
 * @param executionId identifier of the bulk execution; unused here.
 * @param request the bulk request; unused here.
 */
@Override
public void beforeBulk(long executionId, BulkRequest request) {}

Expand Down