-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
82 lines (63 loc) · 2.28 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#author @t_sanf
import os
import pydicom
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
import docx
import win32com.client
from io import StringIO
import re
def rescale_array(array):
    '''
    Min-max rescale an array into the range 0-250.
    The scaler is fitted on the same data it then transforms.
    :param array: data passed to MinMaxScaler (presumably 2-D array-like -- confirm against callers)
    :return: the rescaled data produced by the scaler
    '''
    target_range = (0, 250)
    fitted_scaler = MinMaxScaler(feature_range=target_range).fit(array)
    return fitted_scaler.transform(array)
def order_dicom(dicom_dir):
    '''
    Sort the DICOM files of a directory by their SliceLocation (z position).
    Every directory entry except one named 'VERSION' is read as a DICOM file.
    :param dicom_dir: full path to directory with dicom files
    :return: list of full file paths ordered by ascending SliceLocation
    '''
    slice_positions = {}
    for filename in os.listdir(dicom_dir):
        if filename == 'VERSION':
            # bookkeeping entry that is not a DICOM file; skipped by name
            continue
        path = os.path.join(dicom_dir, filename)
        # dcmread replaces the deprecated read_file alias (removed in pydicom 3.0).
        # Only header metadata is needed, so stop before the pixel data.
        ds = pydicom.dcmread(path, stop_before_pixels=True)
        slice_positions[path] = float(ds.SliceLocation)
    # sorted() is stable, so files sharing a SliceLocation keep listing order
    return sorted(slice_positions, key=slice_positions.get)
#########################
# functions to manipulate Word documents
def getText_without_first_line(filename):
    '''
    Return the text of a .docx document with its first paragraph left out.
    :param filename: path of the .docx file to read
    :return: text of the remaining paragraphs, joined with newlines
    '''
    paragraphs = docx.Document(filename).paragraphs
    # Slice from index 1: only the first paragraph is dropped
    # (presumably the header line holding the dates -- confirm).
    return '\n'.join(paragraph.text for paragraph in paragraphs[1:])
def save_doc_as_docx(doc_path, docx_path):
    '''
    Convert a legacy .doc file to .docx by driving Microsoft Word through COM.
    :param doc_path: path of the .doc file to open
    :param docx_path: path the converted document is saved to
    :return: None
    '''
    word = win32com.client.gencache.EnsureDispatch('Word.Application')
    word.Visible = False
    # Work on the document object returned by Open rather than on
    # ActiveDocument, so the result does not depend on which document
    # Word currently considers active.
    doc = word.Documents.Open(doc_path)
    try:
        # FileFormat=16 is wdFormatDocumentDefault, i.e. .docx
        doc.SaveAs2(docx_path, FileFormat=16)
    finally:
        # Close the document even when saving fails so it is not left open.
        doc.Close()
    # NOTE(review): the Word application itself is never quit here --
    # confirm whether callers rely on the instance staying alive.
######################
#regular expression methods
def remove_dates(path_to_txt_file):
    '''
    Redact dates and a fixed set of identifying strings in a text file, in place.
    Every match is replaced with the marker 'REDACTED' and the file is rewritten.
    :param path_to_txt_file: path of the text file to rewrite
    :return: None
    '''
    marker = 'REDACTED'
    # Patterns are applied in this order to every line. The '.' in the
    # 'Choyke, Peter L.' pattern is left as a regex wildcard, as before, so
    # variants without the trailing period are still redacted.
    patterns = (
        r'[0-9]{1,2}[\/][0-9]{1,2}[\/][0-9]{2,4}',  # d/m/y style dates
        'Baris Turkbey',
        'Choyke, Peter L.',
        '15-C-0124',
    )

    with open(path_to_txt_file, "r") as f:
        lines = f.readlines()

    # Redact everything before reopening for writing, so a failure here
    # cannot leave the file truncated.
    redacted_lines = []
    for line in lines:
        for pattern in patterns:
            line = re.sub(pattern, marker, line)
        redacted_lines.append(line)

    with open(path_to_txt_file, "w") as f:
        f.writelines(redacted_lines)