"""Detect a file's Unicode byte-order mark (BOM) and read the file as CSV.

Code from the article "20230919 Parsing BOMs in Python"
("How to detect/read/write UTF 8/16 BOMs").
"""

import codecs
import csv

# Maps a Python codec name to the BOM byte sequences that select it.
#
# Order matters: detect_encoding() returns the first entry that matches, and
# the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE), so
# "utf-32" has to be tested before "utf-16".
#
# codecs.BOM_UTF16 is not listed separately: it is only an alias for the LE or
# BE constant, whichever is the platform's native byte order.
CODECS = {
    "utf-8-sig": [codecs.BOM_UTF8],
    "utf-32": [
        codecs.BOM_UTF32_LE,
        codecs.BOM_UTF32_BE,
    ],
    "utf-16": [
        codecs.BOM_UTF16_LE,
        codecs.BOM_UTF16_BE,
    ],
}

# Number of leading bytes needed to recognise the longest BOM in CODECS.
_BOM_PROBE_LEN = max(len(bom) for boms in CODECS.values() for bom in boms)


def detect_encoding(fpath):
    """Return the codec name implied by the BOM at the start of *fpath*.

    Only the first few bytes of the file are read, in binary mode, and
    compared against the BOMs listed in ``CODECS``.

    Args:
        fpath: Path of the file to inspect (anything ``open()`` accepts).

    Returns:
        The first key of ``CODECS`` whose BOM prefixes the file, or
        ``"utf-8"`` when no BOM matches (this includes an empty file).

    Raises:
        OSError: If the file cannot be opened.

    Note:
        BOM sniffing is a heuristic. A UTF-16 LE file whose first character
        is U+0000 also starts with FF FE 00 00 and is reported as UTF-32.
    """
    with open(fpath, "rb") as istream:
        head = istream.read(_BOM_PROBE_LEN)
    for encoding, boms in CODECS.items():
        # bytes.startswith accepts a tuple of candidate prefixes.
        if head.startswith(tuple(boms)):
            return encoding
    return "utf-8"


def read(fpath):
    """Yield each row of the CSV file *fpath* as a dict keyed by the header.

    The text encoding is chosen by ``detect_encoding``. This is a generator:
    the file is opened when iteration starts and stays open until the
    generator is exhausted or closed.

    Args:
        fpath: Path of the CSV file to read.

    Yields:
        One dict per data row, as produced by ``csv.DictReader``.
    """
    # newline="" hands line endings to the csv module untouched; without it,
    # universal-newline translation rewrites "\r\n" inside quoted fields.
    with open(
        fpath, "r", encoding=detect_encoding(fpath), newline=""
    ) as istream:
        yield from csv.DictReader(istream)


def main():
    """Print the leading rows of ``test.csv`` together with their index."""
    # run here
    for i, row in enumerate(read("test.csv")):
        print(i, row)
        # Stops after the row with index 11, i.e. at most 12 rows are printed.
        if i > 10:
            break


if __name__ == "__main__":
    main()