-
Notifications
You must be signed in to change notification settings - Fork 1
/
ecod_neo4j.txt
219 lines (157 loc) · 6.75 KB
/
ecod_neo4j.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
// Import ECOD domains: schema setup.
// Creates the lookup index and the uniqueness constraints on the key
// properties that the load statements in this file MERGE / MATCH on.
// Each statement is terminated with ';' so the file can be fed to a Cypher
// shell as a script.
CREATE INDEX ON :Domain(domainID);
CREATE CONSTRAINT ON (d:Domain) ASSERT d.uid IS UNIQUE;
CREATE CONSTRAINT ON (g:XGroup) ASSERT g.type IS UNIQUE;
CREATE CONSTRAINT ON (g:HGroup) ASSERT g.type IS UNIQUE;
CREATE CONSTRAINT ON (g:TGroup) ASSERT g.type IS UNIQUE;
CREATE CONSTRAINT ON (g:FGroup) ASSERT g.type IS UNIQUE;
CREATE CONSTRAINT ON (p:PDBEntry) ASSERT p.id IS UNIQUE;
CREATE CONSTRAINT ON (lig:Ligand) ASSERT lig.type IS UNIQUE;
CREATE CONSTRAINT ON (pdbchain:PDBChain) ASSERT pdbchain.id IS UNIQUE;
CREATE CONSTRAINT ON (arch:ArchGroup) ASSERT arch.type IS UNIQUE;
CREATE CONSTRAINT ON (u:UniprotEntry) ASSERT u.accession IS UNIQUE;
// Load ECOD domains from the tab-separated domains file and build the graph:
//   (Domain)-[:BINDS]->(Ligand)
//   (Domain)-[:BELONGS]->(FGroup)-[:BELONGS]->(TGroup)-[:BELONGS]->(HGroup)
//           -[:BELONGS]->(XGroup)-[:BELONGS]->(ArchGroup)
//   (PDBChain)-[:SUBCHAIN]->(PDBEntry)
//
// * The Domain node is merged on its unique key (uid) only; the remaining
//   properties are set when the node is first created. Merging on the whole
//   property map would try to create a second node with the same uid (and hit
//   the uniqueness constraint) whenever a re-import carries a changed status.
// * Ligands values 'NO_LIGANDS_4A' and 'NA' are treated as "no ligands";
//   anything else is a comma-separated list of ligand types.
// * H_group 'NO_H_NAME' and X_group 'NO_X_NAME' are replaced by a synthetic
//   name derived from the child group's type.
// * CHAINIDS holds one "<PDB_id>.<chain>" id per comma-separated segment of
//   PDB_residue_range (the part before ':' is the chain).
USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM "file:///home/lab/Downloads/ecod.develop86.domains.txt"
AS line FIELDTERMINATOR '\t'
WITH line,
     split(line.F_group, ',') AS FGROUPS,
     [seg IN split(line.PDB_residue_range, ',') | line.PDB_id + "." + toString(split(seg, ':')[0])] AS CHAINIDS
MERGE (d:Domain {uid: toInt(line.Uid)})
ON CREATE SET d.domainID = line.Domain_id,
              d.status = line.Representative_status,
              d.assemblyStatus = line.Assembly_status
FOREACH (l IN (CASE WHEN line.Ligands = 'NO_LIGANDS_4A' THEN [] WHEN line.Ligands = 'NA' THEN [] ELSE split(line.Ligands, ',') END) |
    MERGE (lig:Ligand {type: l})
    CREATE UNIQUE (d)-[:BINDS]->(lig)
)
MERGE (t:TGroup:Architecture {type: line.T_group})
MERGE (h:HGroup:Architecture {type: CASE WHEN line.H_group = 'NO_H_NAME' THEN "H grouping of "+t.type ELSE line.H_group END})
MERGE (x:XGroup:Architecture {type: CASE WHEN line.X_group = 'NO_X_NAME' THEN "X grouping of "+h.type ELSE line.X_group END})
MERGE (arch:ArchGroup:Architecture {type: line.Architecture})
FOREACH (fgroup IN FGROUPS |
    MERGE (f:FGroup {type: fgroup})
    CREATE UNIQUE (d)-[:BELONGS {type:'FGroup'}]->(f)
    CREATE UNIQUE (f)-[:BELONGS {type: 'TGroup'}]->(t)
)
CREATE UNIQUE (t)-[:BELONGS {type: 'HGroup'}]->(h)
CREATE UNIQUE (h)-[:BELONGS {type: 'XGroup'}]->(x)
CREATE UNIQUE (x)-[:BELONGS {type: 'ArchGroup'}]->(arch)
MERGE (pdbentry:PDBEntry {id: line.PDB_id})
FOREACH (chainId IN CHAINIDS |
    MERGE (pdbchain:PDBChain {id: chainId})
    CREATE UNIQUE (pdbchain)-[:SUBCHAIN]->(pdbentry)
);
// Map domains to PDB chains and UniProt entries.
//
// For every row of the ECOD -> UniProt map:
//   * PDBCHAINRES is a list of ["<pdb>.<chain>", [start, end]] pairs, one per
//     comma-separated segment of pdb_range ("<chain>:<start>-<end>"). The PDB
//     id is taken from characters 1-4 of ecod_domain_id.
//   * A segment whose residue range starts with '-' has a negative start
//     residue (e.g. "A:-5-50"). Splitting that on '-' directly would give an
//     empty first element, so the leading '-' is stripped, the remainder is
//     split, and the first number is negated.
//   * UNPRES is a list of [start, end] pairs, one per comma-separated segment
//     of unp_range.
//   * (Domain)-[:MATCHES {res}]->(PDBChain) is created per PDB segment, and
//     (PDBChain)-[:MATCHES {res}]->(UniprotEntry) per matched chain and
//     UniProt segment. The UniprotEntry node is MERGEd so the mapping does not
//     depend on the node existing beforehand.
//
// NOTE(review): ranges whose end residue is also negative, or that carry
// insertion codes, make toInt() return null and the list cannot be stored as a
// property - confirm whether such rows occur in the input.
USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM "file:///home/lab/Downloads/ecod.develop84a.uniprot_map.txt"
AS line FIELDTERMINATOR '\t'
WITH
    line,
    [
        pdbchain IN split(line.pdb_range, ',') |
        [
            substring(line.ecod_domain_id, 1, 4) + "." + toString(split(pdbchain, ':')[0]),
            CASE left(split(pdbchain, ':')[1], 1)
                WHEN '-' THEN
                    [
                        -1 * toInt(split(substring(split(pdbchain, ':')[1], 1), '-')[0]),
                        toInt(split(substring(split(pdbchain, ':')[1], 1), '-')[1])
                    ]
                ELSE
                    [res IN split(split(pdbchain, ':')[1], '-') | toInt(res)]
            END
        ]
    ] AS PDBCHAINRES,
    [
        unpres IN split(line.unp_range, ',') |
        [res IN split(unpres, '-') | toInt(res)]
    ] AS UNPRES
MATCH (d:Domain {uid: toInt(line.uid)})
MERGE (u:UniprotEntry {accession: line.unp_acc})
WITH d, u, PDBCHAINRES, UNPRES
UNWIND PDBCHAINRES AS chainres
MATCH (c:PDBChain {id: chainres[0]})
CREATE (d)-[:MATCHES {res: chainres[1]}]->(c)
WITH c, u, UNPRES
UNWIND UNPRES AS unpres
CREATE (c)-[:MATCHES {res: unpres}]->(u);
// Fix residue mapping
// Repair Domain-PDBChain MATCHES relationships whose res list has three
// elements: drop element 0 and store [-(element 1), element 2] as integers.
// Presumably these come from ranges with a negative start residue that were
// split on '-' - verify against the import that produced them.
MATCH (:Domain)-[link:MATCHES]-(:PDBChain)
WHERE length(link.res) = 3
SET link.res = [-1 * toInt(link.res[1]), toInt(link.res[2])]
RETURN link AS r;
// Add group type labels (FGroup / XGroup / HGroup / TGroup) to existing
// :Group nodes, looked up by their type property.
//
// FOREACH accepts only updating clauses, so a MATCH cannot be nested inside
// it. The work is therefore done in two passes over the same file: the first
// expands the comma-separated Fgroup column with UNWIND and labels each
// matching node; the second labels the X/H/T groups of each row. Keeping the
// passes separate means a row with an unmatched X/H/T group still gets its
// F-groups labelled, and vice versa.
USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM 'file:///home/lab/Downloads/ecod.ready.latest.domains_part1.txt'
AS line FIELDTERMINATOR '\t'
UNWIND split(line.Fgroup, ',') AS fgroup
MATCH (f:Group {type: fgroup})
SET f:FGroup;

USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM 'file:///home/lab/Downloads/ecod.ready.latest.domains_part1.txt'
AS line FIELDTERMINATOR '\t'
MATCH (x:Group {type: line.Xgroup})
MATCH (h:Group {type: line.Hgroup})
MATCH (t:Group {type: line.Tgroup})
SET x:XGroup
SET h:HGroup
SET t:TGroup;
// Fix architecture hierarchy
// For every Domain linked by BELONGS to both an FGroup and a TGroup, make sure
// the FGroup has a BELONGS {type: "TGroup"} relationship to that TGroup, then
// remove the direct Domain-TGroup relationship.
MATCH (fam:FGroup)-[:BELONGS]-(:Domain)-[directLink:BELONGS]-(topo:TGroup)
CREATE UNIQUE (fam)-[:BELONGS {type: "TGroup"}]->(topo)
DELETE directLink;
// Import PDB chain -> UniProt mapping from PDBSWS.
// Only chains that already exist as :PDBChain nodes are mapped (MATCH); the
// UniProt node is created on demand. The :MAP relationship is MERGEd rather
// than CREATEd so that repeated rows for the same chain/accession pair, or a
// re-run of this load, do not pile up identical parallel relationships.
// NOTE(review): these nodes are labelled :Uniprot, whereas the uniqueness
// constraint and the ECOD mapping in this file use :UniprotEntry - confirm
// whether the two labels are meant to be distinct.
USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM 'file:///home/lab/Downloads/pdb_uniprot_chain_map.lst.2'
AS line FIELDTERMINATOR ','
MATCH (c:PDBChain {id: line.pdb+"."+line.chain})
MERGE (u:Uniprot {accession: line.uniprot})
MERGE (c)-[:MAP]->(u);
// Import PDB chain -> UniProt mapping from EBI SIFTS.
// Only chains that already exist as :PDBChain nodes are mapped (MATCH); the
// UniProt node is created on demand. The :MAP relationship is MERGEd rather
// than CREATEd so that repeated rows for the same chain/accession pair, or a
// re-run of this load, do not pile up identical parallel relationships.
// NOTE(review): these nodes are labelled :Uniprot, whereas the uniqueness
// constraint and the ECOD mapping in this file use :UniprotEntry - confirm
// whether the two labels are meant to be distinct.
USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM 'file:///home/lab/Downloads/pdb_chain_uniprot.csv'
AS line FIELDTERMINATOR ','
MATCH (c:PDBChain {id: line.PDB+"."+line.CHAIN})
MERGE (u:Uniprot {accession: line.SP_PRIMARY})
MERGE (c)-[:MAP]->(u);
//RETURN [pdbchain IN (split(line.pdbRES, ','))|[subchain IN (split(pdbchain, ':'))|subchain]] AS result
//WITH split(line.pdbRES, ',') AS PDBRES
//WITH [pdbchain IN PDBRES| split(pdbchain, ':')] AS temp
//RETURN [tmp IN temp| split(tmp[1], '-')] AS result
//MERGE (d:Domain {uid: toInt(line.uid), domainID: line.domainID, status: line.status, chainID: line.chainID, pdbRES: PDBRES, assemblyStatus: line.assemblyStatus})
// Flat import of ECOD domains: one :domain node per row, no hierarchy.
// WITH HEADERS is required because the properties are read by column name
// (line.uid, line.domainID, ...); without it each row is a plain list and the
// name lookups fail. Assumes the file has a header row naming these columns -
// TODO confirm.
// NOTE(review): the label here is lower-case :domain, unlike the :Domain label
// used by the other statements in this file - confirm which is intended.
USING PERIODIC COMMIT 100
LOAD CSV WITH HEADERS FROM 'file:///home/lab/Downloads/ecod.ready.latest.domains.txt'
AS line FIELDTERMINATOR '\t'
CREATE (d:domain {
    uid: toInt(line.uid),
    domainID: line.domainID,
    status: line.status,
    hierachyID: line.hierachyID,
    pdbID: line.pdbID,
    chainID: line.chainID,
    assemblyStatus: line.assemblyStatus
});
// Preview: return the first four raw rows of the domains file. No headers are
// requested, so each row comes back as a list of column strings.
LOAD CSV FROM 'file:///home/lab/Downloads/ecod.ready.latest.domains.txt'
AS line FIELDTERMINATOR '\t'
WITH line LIMIT 4
RETURN line;
// Get a single path to the top-level architecture group.
// Pair every Domain that has a BINDS relationship to the Ligand of type "CA"
// with every ArchGroup, compute the shortest BELONGS path for each pair, and
// return only the shortest of those paths (first after ordering by length).
MATCH (dom:Domain)-[:BINDS]-(:Ligand {type: "CA"}), (top:ArchGroup)
WITH dom, COLLECT(top) AS tops
UNWIND tops AS target
MATCH route = shortestPath((dom)-[:BELONGS*]-(target))
WITH route ORDER BY length(route)
RETURN COLLECT(route)[0] AS path;
// Find parent: for Domains that have a BINDS relationship to the Ligand of
// type "CA", follow outgoing BELONGS relationships and return the distinct
// nodes that have no outgoing BELONGS of their own (the top of each chain).
// The node in the WHERE pattern is written as (a): a bare variable without
// parentheses is not accepted by newer Cypher parsers.
MATCH (d:Domain)-[:BINDS]-(l:Ligand {type: "CA"}), (d)-[:BELONGS*]->(a)
WHERE NOT (a)-[:BELONGS]->()
RETURN DISTINCT a;
// Read-only preview of the pdb_range / unp_range parsing for the first 25 rows
// of the ECOD -> UniProt map. Nothing is written to the graph.
//
// PDBCHAINRES is a list of ["<pdb>.<chain>", [start, end]] pairs, one per
// comma-separated segment of pdb_range; the PDB id is characters 1-4 of
// ecod_domain_id. When the residue range starts with '-' (negative start
// residue), the leading '-' is stripped before splitting on '-' and the first
// number is negated. CASE is an expression, so both branches must yield the
// finished list directly; assignments and aliases are not allowed inside it.
// UNPRES (parsed unp_range segments) is computed alongside but not returned.
LOAD CSV WITH HEADERS FROM "file:///home/lab/Downloads/ecod.develop84a.uniprot_map.txt"
AS line FIELDTERMINATOR '\t'
WITH
    [
        pdbchain IN split(line.pdb_range, ',') |
        [
            substring(line.ecod_domain_id, 1, 4) + "." + toString(split(pdbchain, ':')[0]),
            CASE left(split(pdbchain, ':')[1], 1)
                WHEN '-' THEN
                    [
                        -1 * toInt(split(substring(split(pdbchain, ':')[1], 1), '-')[0]),
                        toInt(split(substring(split(pdbchain, ':')[1], 1), '-')[1])
                    ]
                ELSE
                    [res IN split(split(pdbchain, ':')[1], '-') | toInt(res)]
            END
        ]
    ] AS PDBCHAINRES,
    [
        unpres IN split(line.unp_range, ',') |
        [res IN split(unpres, '-') | toInt(res)]
    ] AS UNPRES,
    line LIMIT 25
UNWIND PDBCHAINRES AS chainres
RETURN chainres[0], chainres[1];