From 1015db663f395d948c71b94d7ce9a6aa07c55cce Mon Sep 17 00:00:00 2001
From: Tom McKeesick <tmck01@gmail.com>
Date: Tue, 19 Sep 2023 07:32:54 +1000
Subject: [PATCH 1/2] create article for python BOMs

---
 README.md                                                    | 5 +++++
 .../20230919_parsing_boms_in_python.md                       | 1 +
 2 files changed, 6 insertions(+)
 create mode 100644 articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md

diff --git a/README.md b/README.md
index 919157b..046f8ff 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,10 @@ my blog
 
 ---
 
+### [20230919 Parsing BOMs in Python](articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md)
+
+> _How to detect/read/write UTF-8/UTF-16 BOMs_
+
 ### [20230704 Jupyter Cell Wrappers](articles/20230704_jupyter_cell_wrappers/20230704_jupyter_cell_wrappers.md)
 
 > _Adding decorator-style functionality to jupyter cells_
@@ -69,3 +73,4 @@ my blog
 
 
 
+
diff --git a/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md b/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md
new file mode 100644
index 0000000..ff5dbaf
--- /dev/null
+++ b/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md
@@ -0,0 +1 @@
+# 20230919 Parsing BOMs in Python

From 7a8006838e20028b42bbdcf43334663de71082d2 Mon Sep 17 00:00:00 2001
From: Tom McKeesick <tmck01@gmail.com>
Date: Tue, 19 Sep 2023 07:48:01 +1000
Subject: [PATCH 2/2] add code

---
 .../20230919_parsing_boms_in_python.md        | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md b/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md
index ff5dbaf..85678f3 100644
--- a/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md
+++ b/articles/20230919_parsing_boms_in_python/20230919_parsing_boms_in_python.md
@@ -1 +1,34 @@
 # 20230919 Parsing BOMs in Python
+
+```python
+ import csv, codecs
+
+ CODECS = {
+     "utf-8-sig": [codecs.BOM_UTF8],
+     "utf-16": [
+        codecs.BOM_UTF16,
+        codecs.BOM_UTF16_BE,
+        codecs.BOM_UTF16_LE,
+    ]
+ }
+
+ def detect_encoding(fpath):
+     with open(fpath, 'rb') as istream:
+         data = istream.read(3)
+         for encoding, boms in CODECS.items():
+             if any(data.startswith(bom) for bom in boms):
+                 return encoding
+     return 'utf-8'
+
+ def read(fpath):
+     with open(fpath, 'r', encoding=detect_encoding(fpath)) as istream:
+         yield from csv.DictReader(istream)
+```
+
+```python
+ # run here
+ for i, row in enumerate(read('test.csv')):
+     print(i, row)
+     if i > 10:
+         break
+```