"""Detect UTF-8/UTF-16 byte-order marks (BOMs) and read CSV files accordingly."""

import codecs
import csv
from typing import Iterator

# Maps a Python codec name to the BOM byte sequences that select it.
# Order matters: the first codec with a matching BOM wins.
CODECS = {
    "utf-8-sig": [codecs.BOM_UTF8],
    "utf-16": [
        codecs.BOM_UTF16,  # native-endian alias of one of the two below
        codecs.BOM_UTF16_BE,
        codecs.BOM_UTF16_LE,
    ],
}


def detect_encoding(fpath) -> str:
    """Return the codec name implied by the BOM at the start of *fpath*.

    The file is opened in binary mode and only its first few bytes are
    inspected. If they start with a BOM listed in ``CODECS`` the matching
    codec name is returned; otherwise ``'utf-8'`` is returned.

    NOTE: the UTF-32 LE BOM (``FF FE 00 00``) begins with the UTF-16 LE BOM,
    so such a file is reported as ``'utf-16'``; UTF-32 is not handled here.
    """
    # Read exactly as many bytes as the longest known BOM needs, so the probe
    # stays correct if entries are added to CODECS.
    probe_len = max(
        (len(bom) for boms in CODECS.values() for bom in boms),
        default=0,
    )
    with open(fpath, 'rb') as istream:
        data = istream.read(probe_len)
    for encoding, boms in CODECS.items():
        # bytes.startswith accepts a tuple of candidate prefixes.
        if data.startswith(tuple(boms)):
            return encoding
    return 'utf-8'


def read(fpath) -> Iterator[dict]:
    """Lazily yield each row of the CSV file at *fpath* as a dict.

    The encoding is chosen by ``detect_encoding``; the ``utf-8-sig`` and
    ``utf-16`` codecs consume the BOM, so it never leaks into the first
    header name. The file stays open until the generator is exhausted or
    closed.
    """
    # newline='' is required by the csv module: without it, universal-newline
    # translation would rewrite "\r\n" inside quoted fields before the parser
    # sees them.
    with open(
        fpath, 'r', encoding=detect_encoding(fpath), newline=''
    ) as istream:
        yield from csv.DictReader(istream)


if __name__ == "__main__":
    # run here
    for i, row in enumerate(read('test.csv')):
        print(i, row)
        if i > 10:
            break