Merge pull request #4547 from unicef/south-sudan-update-script-fix
South sudan update script fix
johniak authored Jan 16, 2025
2 parents 50c0db1 + 103f00f commit b185185
Showing 4 changed files with 157 additions and 30 deletions.
@@ -48,7 +48,7 @@


 @transaction.atomic
-def south_sudan_update_script(file_path: str, program_id: str) -> None:
+def south_sudan_update_script(file_path: str, program_id: str, batch_size: int) -> None:
     program = Program.objects.get(id=program_id)
     business_area = program.business_area
     update = UniversalIndividualUpdateScript(
@@ -61,5 +61,6 @@ def south_sudan_update_script(file_path: str, program_id: str) -> None:
         document_fields,
         deduplicate_documents=True,
         deduplicate_es=True,
+        batch_size=batch_size,
     )
     update.execute()
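The entry point now threads an explicit batch_size through to UniversalIndividualUpdateScript instead of relying on the hard-coded constants inside it. A minimal invocation sketch, e.g. from a Django shell (the import path, file path, and program ID are illustrative assumptions, not taken from this diff):

# Sketch only: the module path and both argument values are placeholders.
from hct_mis_api.one_time_scripts.south_sudan_update_script import south_sudan_update_script

south_sudan_update_script(
    file_path="/path/to/update_script_sudan.xlsx",  # workbook whose rows carry a "unicef_id" column
    program_id="<program-uuid>",                    # resolved via Program.objects.get(id=program_id)
    batch_size=100,                                 # rows accumulated before each bulk flush
)

Since the function is wrapped in @transaction.atomic, a failure in any batch rolls back all previously flushed batches as well.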
@@ -5,12 +5,19 @@

 from hct_mis_api.apps.core.models import BusinessArea
 from hct_mis_api.apps.geo.models import Country
-from hct_mis_api.apps.household.models import Document, DocumentType, Individual
+from hct_mis_api.apps.household.documents import HouseholdDocument, get_individual_doc
+from hct_mis_api.apps.household.models import (
+    Document,
+    DocumentType,
+    Household,
+    Individual,
+)
 from hct_mis_api.apps.program.models import Program
 from hct_mis_api.apps.registration_datahub.tasks.deduplicate import (
     DeduplicateTask,
     HardDocumentDeduplication,
 )
+from hct_mis_api.apps.utils.elasticsearch_utils import populate_index


 class UniversalIndividualUpdateScript:
@@ -26,6 +33,7 @@ def __init__(
         ignore_empty_values: bool = True,
         deduplicate_es: bool = True,
         deduplicate_documents: bool = True,
+        batch_size: int = 100,
     ) -> None:
         self.business_area = business_area
         self.program = program
@@ -38,7 +46,9 @@ def __init__(
         self.deduplicate_es = deduplicate_es
         self.deduplicate_documents = deduplicate_documents
         document_types = DocumentType.objects.filter()
+        self.countries = {country.name: country for country in Country.objects.all()}
         self.document_types = {f"{document_type.key}_no_i_c": document_type for document_type in document_types}
+        self.batch_size = batch_size

     def validate_household_fields(
         self, row: Tuple[Any, ...], headers: List[str], household: Any, row_index: int
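The two dictionaries built in __init__ replace per-row database queries: self.countries keys every Country by name, and self.document_types keys each DocumentType by its spreadsheet column name. A sketch of how a column pair such as national_id_no_i_c / national_id_country_i_c now resolves (values illustrative):

# O(1) in-memory lookups instead of one query per spreadsheet row
# (sketch; "script" stands for an UniversalIndividualUpdateScript instance):
document_type = script.document_types["national_id_no_i_c"]  # DocumentType with key="national_id"
country = script.countries["Poland"]  # Country named "Poland"; raises KeyError if absent,
                                      # which the validation pass is meant to rule out first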
@@ -79,7 +89,10 @@ def validate_documents(
         errors = []
         for number_column_name, country_column_name in self.document_fields:
             document_type = self.document_types.get(number_column_name)
+            number_text = row[headers.index(number_column_name)]
             country_text = row[headers.index(country_column_name)]
+            if country_text is None and number_text is None:
+                continue
             country = Country.objects.filter(name=country_text).first()
             if country is None:
                 errors.append(
@@ -96,8 +109,8 @@ def validate(self, sheet: Worksheet, headers: List[str]) -> List[str]:
         row_index = 1
         for row in sheet.iter_rows(min_row=2, values_only=True):
             row_index += 1
-            if (row_index - 2) % 1000 == 0:
-                print(f"Validating row {row_index - 2} to {row_index - 2 + 100} Indivduals")
+            if (row_index - 2) % self.batch_size == 0:
+                print(f"Validating row {row_index - 2} to {row_index - 2 + self.batch_size} Indivduals")
             unicef_id = row[headers.index("unicef_id")]
             individuals_queryset = Individual.objects.filter(
                 unicef_id=unicef_id, business_area=self.business_area, program=self.program
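Together, the two new guards define how empty document cells flow through the script: a row whose number and country cells are both empty is skipped outright during validation, and the update path then leaves the stored document untouched. A sketch of the behavior per column pair (illustrative, derived from the code above and the test expectations below):

# number cell   country cell   validate()              handle_documents_update()
# None          None           skipped (continue)      skipped (ignore_empty_values)
# "TEST123"     "Poland"       country must resolve    document updated or created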
@@ -143,49 +156,117 @@ def handle_individual_flex_update(self, row: Tuple[Any, ...], headers: List[str]
                 continue
             individual.flex_fields[name] = handled_value

-    def handle_documents_update(self, row: Tuple[Any, ...], headers: List[str], individual: Individual) -> None:
+    def handle_documents_update(
+        self, row: Tuple[Any, ...], headers: List[str], individual: Individual
+    ) -> Tuple[list, list]:
+        documents_to_update = []
+        documents_to_create = []
         for number_column_name, country_column_name in self.document_fields:
             document_type = self.document_types.get(number_column_name)
             document_number = row[headers.index(number_column_name)]
             document_country = row[headers.index(country_column_name)]
-            country = Country.objects.filter(name=document_country).first()
             if self.ignore_empty_values and (document_number is None or document_number == ""):  # pragma: no cover
                 continue
-            document = individual.documents.filter(type=document_type).first()
+            country = self.countries[document_country]
+            document = None
+            for doc in individual.documents.all():
+                if doc.type == document_type:
+                    document = doc
+                    break
             if document:
                 document.document_number = document_number
                 document.status = Document.STATUS_PENDING
-                document.save()
+                document.country = country
+                documents_to_update.append(document)
             else:
-                Document.objects.create(
-                    individual=individual,
-                    type=document_type,
-                    document_number=document_number,
-                    country=country,
-                    rdi_merge_status="MERGED",
-                )
+                documents_to_create.append(
+                    Document(
+                        individual=individual,
+                        type=document_type,
+                        document_number=document_number,
+                        country=country,
+                        rdi_merge_status="MERGED",
+                    )
+                )
+        return documents_to_update, documents_to_create

     def handle_update(self, sheet: Worksheet, headers: List[str]) -> List[str]:
         row_index = 1
         individual_ids = []
+        household_fields_to_update = ["flex_fields"]
+        individual_fields_to_update = ["flex_fields"]
+        document_fields_to_create = ["document_number", "status", "country"]
+        household_fields_to_update.extend([field for _, (field, _, _) in self.household_fields.items()])
+        individual_fields_to_update.extend([field for _, (field, _, _) in self.individual_fields.items()])
+        individuals_to_update = []
+        households_to_update = []
+        documents_to_update = []
+        documents_to_create = []
         for row in sheet.iter_rows(min_row=2, values_only=True):
             row_index += 1
-            if (row_index - 2) % 1000 == 0:
-                print(f"Updating row {row_index - 2} to {row_index - 2 + 100} Individuals")
+            if (row_index - 2) % self.batch_size == 0:
+                print(f"Updating row {row_index - 2} to {row_index - 2 + self.batch_size} Individuals")
             unicef_id = row[headers.index("unicef_id")]
-            individual = Individual.objects.filter(
-                unicef_id=unicef_id, business_area=self.business_area, program=self.program
-            ).first()
+            individual = (
+                Individual.objects.select_related("household")
+                .prefetch_related("documents")
+                .get(unicef_id=unicef_id, business_area=self.business_area, program=self.program)
+            )
             individual_ids.append(str(individual.id))
             household = individual.household
             self.handle_household_update(row, headers, household)
             self.handle_individual_update(row, headers, individual)
             self.handle_individual_flex_update(row, headers, individual)
-            self.handle_documents_update(row, headers, individual)
-            household.save()
-            individual.save()
+            documents_to_update_part, documents_to_create_part = self.handle_documents_update(row, headers, individual)
+            documents_to_update.extend(documents_to_update_part)
+            documents_to_create.extend(documents_to_create_part)
+            households_to_update.append(household)
+            individuals_to_update.append(individual)
+            if len(individuals_to_update) == self.batch_size:
+                self.batch_update(
+                    document_fields_to_create,
+                    documents_to_create,
+                    documents_to_update,
+                    household_fields_to_update,
+                    households_to_update,
+                    individual_fields_to_update,
+                    individuals_to_update,
+                )
+                households_to_update = []
+                individuals_to_update = []
+        self.batch_update(
+            document_fields_to_create,
+            documents_to_create,
+            documents_to_update,
+            household_fields_to_update,
+            households_to_update,
+            individual_fields_to_update,
+            individuals_to_update,
+        )
         return individual_ids

+    def batch_update(
+        self,
+        document_fields_to_create: list,
+        documents_to_create: list,
+        documents_to_update: list,
+        household_fields_to_update: list,
+        households_to_update: list,
+        individual_fields_to_update: list,
+        individuals_to_update: list,
+    ) -> None:
+        Document.objects.bulk_update(documents_to_update, document_fields_to_create)
+        Document.objects.bulk_create(documents_to_create)
+        Household.objects.bulk_update(households_to_update, household_fields_to_update)
+        Individual.objects.bulk_update(individuals_to_update, individual_fields_to_update)
+        populate_index(
+            Individual.objects.filter(id__in=[individual.id for individual in individuals_to_update]),
+            get_individual_doc(self.business_area.slug),
+        )
+        populate_index(
+            Household.objects.filter(id__in=[household.id for household in households_to_update]), HouseholdDocument
+        )

     def execute(self) -> None:
         workbook = load_workbook(filename=self.file_path)
         sheet = workbook.active
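This is the heart of the fix: handle_update accumulates modified instances row by row, and batch_update persists each model with a single bulk query per flush instead of two saves per row. A stripped-down sketch of the accumulate-and-flush pattern (rows, apply_row, fields, and batch_size are illustrative names, not the production code):

# Generic accumulate-and-flush loop, mirroring handle_update above (sketch):
pending = []
for row in rows:
    instance = apply_row(row)  # mutate an existing model instance in memory
    pending.append(instance)
    if len(pending) == batch_size:
        Individual.objects.bulk_update(pending, fields)  # one UPDATE round-trip per batch
        pending = []
if pending:  # flush the partial final batch
    Individual.objects.bulk_update(pending, fields)

Each flush also refreshes the Elasticsearch indexes for just the touched individuals and households via populate_index, so the deduplication steps that execute() runs afterwards see the updated values. Note that the household and individual batches are cleared after each flush, while the two document lists are not.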
Binary file modified tests/unit/one_time_scripts/files/update_script_sudan.xlsx
61 changes: 53 additions & 8 deletions tests/unit/one_time_scripts/test_south_sudan_update_script.py
@@ -57,7 +57,30 @@ def setUpTestData(cls) -> None:
         individual.unicef_id = "IND-0"
         individual.save()
         individual.refresh_from_db()

+        household2, individuals2 = create_household_and_individuals(
+            household_data={
+                "business_area": business_area,
+                "program_id": program.id,
+            },
+            individuals_data=[
+                {
+                    "business_area": business_area,
+                    "program_id": program.id,
+                },
+            ],
+        )
+        individual = individuals[0]
+        individual.unicef_id = "IND-0"
+        individual.save()
+        individual.refresh_from_db()
         cls.individual = individual
+
+        individual2 = individuals2[0]
+        individual2.unicef_id = "IND-1"
+        individual2.save()
+        individual2.refresh_from_db()
+        cls.individual2 = individual2
         rebuild_search_index()
@@ -77,13 +100,33 @@ def test_south_sudan_update_script(self) -> None:
             country=poland,
             rdi_merge_status="MERGED",
         )

-        with Capturing():
+        Document.objects.create(
+            individual=self.individual,
+            type=DocumentType.objects.get(key="birth_certificate"),
+            document_number="OLD",
+            country=germany,
+            rdi_merge_status="MERGED",
+        )
+        with Capturing() as output:
             south_sudan_update_script(
-                f"{settings.TESTS_ROOT}/one_time_scripts/files/update_script_sudan.xlsx", self.program.id
+                f"{settings.TESTS_ROOT}/one_time_scripts/files/update_script_sudan.xlsx", self.program.id, 1
             )
+        expected_output = [
+            "Validating row 0 to 1 Indivduals",
+            "Validating row 1 to 2 Indivduals",
+            "Validation successful",
+            "Updating row 0 to 1 Individuals",
+            "Updating row 1 to 2 Individuals",
+            "Deduplicating individuals Elasticsearch",
+            "Deduplicating documents",
+            "Update successful",
+        ]
+
+        self.assertEqual(output, expected_output)
         self.individual.refresh_from_db()
+        self.individual2.refresh_from_db()
         individual = self.individual
+        individual2 = self.individual2
         household = individual.household
         self.assertEqual(household.admin1.p_code, "AF11")
         self.assertEqual(household.admin2.p_code, "AF1115")
@@ -102,22 +145,24 @@
         self.assertEqual(individual.flex_fields.get("ss_hw_cadre_i_f"), "aaaaa")
         self.assertEqual(individual.documents.get(type__key="national_id").document_number, "TEST123")
         self.assertEqual(individual.documents.get(type__key="national_id").country.iso_code3, "POL")
-        self.assertEqual(individual.documents.get(type__key="birth_certificate").document_number, "TEST456")
+        self.assertEqual(individual.documents.get(type__key="birth_certificate").document_number, "OLD")
         self.assertEqual(individual.documents.get(type__key="birth_certificate").country.iso_code3, "DEU")
+        self.assertEqual(individual2.middle_name, "Testowy")
+        self.assertEqual(individual2.family_name, "Tesciak")

     def test_south_sudan_update_script_validation_fails(self) -> None:
         with Capturing() as output:
             south_sudan_update_script(
-                f"{settings.TESTS_ROOT}/one_time_scripts/files/update_script_sudan.xlsx", self.program.id
+                f"{settings.TESTS_ROOT}/one_time_scripts/files/update_script_sudan.xlsx", self.program.id, 1
             )
         expected_output = [
-            "Validating row 0 to 100 Indivduals",
+            "Validating row 0 to 1 Indivduals",
+            "Validating row 1 to 2 Indivduals",
             "Validation failed",
             "Row: 2 - Administrative area admin1 with p_code AF11 not found",
             "Row: 2 - Administrative area admin2 with p_code AF1115 not found",
             "Row: 2 - Country not found for field national_id_country_i_c and value Poland",
             "Row: 2 - Document type not found for field national_id_no_i_c",
             "Row: 2 - Country not found for field birth_certificate_country_i_c and value Germany",
             "Row: 2 - Document type not found for field birth_certificate_no_i_c",
         ]
         self.assertEqual(output, expected_output)
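The output assertions rely on the Capturing helper used throughout these tests; its definition is not part of this diff. A minimal sketch of a helper with the behavior the tests assume, i.e. a context manager that collects printed lines into a list:

import sys
from io import StringIO

class Capturing(list):
    # Sketch of the assumed helper: gathers everything print()ed
    # inside the with-block as a list of lines.
    def __enter__(self) -> "Capturing":
        self._stdout = sys.stdout
        sys.stdout = self._buffer = StringIO()
        return self

    def __exit__(self, *exc: object) -> None:
        self.extend(self._buffer.getvalue().splitlines())
        sys.stdout = self._stdout

With batch_size=1 and two spreadsheet rows, the progress prints become deterministic ("row 0 to 1", "row 1 to 2"), which is what lets both tests compare the captured output against a fixed list.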
