-
Notifications
You must be signed in to change notification settings - Fork 0
/
Introduction_to_MongoDB_in_Python.py
555 lines (389 loc) · 15.8 KB
/
Introduction_to_MongoDB_in_Python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
############################
# Flexibly Structured Data #
############################
### Listing databases and collections
# Save a list of names of the databases managed by client
db_names = client.list_database_names()
print(db_names)
# Save a list of names of the collections managed by the "nobel" database
nobel_coll_names = client.nobel.list_collection_names()
print(nobel_coll_names)
### List fields of a document
# Connect to the "nobel" database
db = client.nobel
# Retrieve sample prize and laureate documents
prize = db.prizes.find_one()
laureate = db.laureates.find_one()
# Print the sample prize and laureate documents
print(prize)
print(laureate)
print(type(laureate))
# Get the fields present in each type of document
prize_fields = list(prize.keys())
laureate_fields = list(laureate.keys())
print(prize_fields)
print(laureate_fields)
### Composing filters
# 1
# Create a filter for laureates who died in the USA
criteria = {"diedCountry": "USA"}
# Save the count of these laureates
count = db.laureates.count_documents(criteria)
print(count)
# 2
# Create a filter for laureates who died in the USA but were born in Germany
criteria = {"diedCountry": "USA",
"bornCountry": "Germany"}
# Save the count
count = db.laureates.count_documents(criteria)
print(count)
# 3
# Create a filter for Germany-born laureates who died in the USA and with the first name "Albert"
criteria = {"diedCountry": "USA",
"bornCountry": "Germany",
"firstname": "Albert"}
# Save the count
count = db.laureates.count_documents(criteria)
print(count)
### We've got options
# 1
# Save a filter for laureates born in the USA, Canada, or Mexico
criteria = { "bornCountry":
{ "$in": ["USA", "Canada", "Mexico"]}
}
# Count them and save the count
count = db.laureates.count_documents(criteria)
print(count)
# 2
# Save a filter for laureates who died in the USA and were not born there
criteria = { "diedCountry": "USA",
"bornCountry": { "$ne": "USA"},
}
# Count them
count = db.laureates.count_documents(criteria)
print(count)
### Starting our ascent
# Filter for laureates born in Austria with non-Austria prize affiliation
criteria = {"bornCountry": "Austria",
"prizes.affiliations.country": {"$ne": "Austria"}}
# Count the number of such laureates
count = db.laureates.count_documents(criteria)
print(count)
### Our 'born' approximation, and a special laureate
# 1
# Filter for documents without a "born" field
criteria = {"born": {"$exists": False}}
# Save count
count = db.laureates.count_documents(criteria)
print(count)
# 2
# Filter for laureates with at least three prizes
criteria = {"prizes.2": {"$exists": True}}
# Find one laureate with at least three prizes
doc = db.laureates.find_one(criteria)
# Print the document
print(doc)
#########################################
# Working with Distinct Values and Sets #
#########################################
### Never from there, but sometimes there at last
# Countries recorded as countries of death but not as countries of birth
countries = set(db.laureates.distinct("diedCountry")) - set(db.laureates.distinct("bornCountry"))
print(countries)
### Countries of affiliation
# The number of distinct countries of laureate affiliation for prizes
count = len(db.laureates.distinct("prizes.affiliations.country"))
print(count)
### Triple plays (mostly) all around
# Save a filter for prize documents with three or more laureates
criteria = {"laureates.2": {"$exists": True}}
# Save the set of distinct prize categories in documents satisfying the criteria
triple_play_categories = set(db.prizes.distinct("category", criteria))
# Confirm literature as the only category not satisfying the criteria.
assert set(db.prizes.distinct("category")) - triple_play_categories == {"literature"}
### Meanwhile, in other categories...
# Save a filter for laureates with unshared prizes
unshared = {
"prizes": {"$elemMatch": {
"category": {"$nin": ["physics", "chemistry", "medicine"]},
"share": "1",
"year": {"$gte": "1945"},
}}}
# Save a filter for laureates with shared prizes
shared = {
"prizes": {"$elemMatch": {
"category": {"$nin": ["physics", "chemistry", "medicine"]},
"share": {"$ne": "1"},
"year": {"$gte": "1945"},
}}}
ratio = db.laureates.count_documents(unshared) / db.laureates.count_documents(shared)
print(ratio)
### Organizations and prizes over time
# Save a filter for organization laureates with prizes won before 1945
before = {
"gender": "org",
"prizes.year": {"$lt": "1945"},
}
# Save a filter for organization laureates with prizes won in or after 1945
in_or_after = {
"gender": "org",
"prizes.year": {"$gte": "1945"},
}
n_before = db.laureates.count_documents(before)
n_in_or_after = db.laureates.count_documents(in_or_after)
ratio = n_in_or_after / (n_in_or_after + n_before)
print(ratio)
### Germany, then and now
# 1
from bson.regex import Regex
# Filter for laureates with "Germany" in their "bornCountry" value
criteria = {"bornCountry": Regex("Germany",0)}
print(set(db.laureates.distinct("bornCountry", criteria)))
#2
from bson.regex import Regex
# Filter for laureates with a "bornCountry" value starting with "Germany"
criteria = {"bornCountry": Regex("^Germany")}
print(set(db.laureates.distinct("bornCountry", criteria)))
# 3
from bson.regex import Regex
# Fill in a string value to be sandwiched between the strings "^Germany " and "now"
criteria = {"bornCountry": Regex("^Germany " + "\\(" + "now")}
print(set(db.laureates.distinct("bornCountry", criteria)))
# 4
from bson.regex import Regex
#Filter for currently-Germany countries of birth. Fill in a string value to be sandwiched between the strings "now" and "$"
criteria = {"bornCountry": Regex("now" + " Germany\\)" + "$")}
print(set(db.laureates.distinct("bornCountry", criteria)))
### The prized transistor
from bson.regex import Regex
# Save a filter for laureates with prize motivation values containing "transistor" as a substring
criteria = {"prizes.motivation": Regex("transistor")}
# Save the field names corresponding to a laureate's first name and last name
first, last = "firstname", "surname"
print([(laureate[first], laureate[last]) for laureate in db.laureates.find(criteria)])
####################################
# Get Only What You Need, and Fast #
####################################
### Rounding up the G.S. crew
# Use projection to select only firstname and surname
docs = db.laureates.find(
filter= {"firstname" : {"$regex" : "^G"},
"surname" : {"$regex" : "^S"} },
projection= ["firstname", "surname"] )
# Iterate over docs and concatenate first name and surname
full_names = [doc["firstname"] + " " + doc["surname"] for doc in docs]
# Print the full names
print(full_names)
### Doing our share of data validation
# Save documents, projecting out laureates share
prizes = db.prizes.find({}, ["laureates.share"])
# Iterate over prizes
for prize in prizes:
# Initialize total share
total_share = 0
# Iterate over laureates for the prize
for laureate in prize["laureates"]:
# add the share of the laureate to total_share
total_share += 1 / float(laureate["share"])
# Print the total share
print(total_share)
### Sorting together: MongoDB + Python
from operator import itemgetter
def all_laureates(prize):
# sort the laureates by surname
sorted_laureates = sorted(prize["laureates"], key=itemgetter("surname"))
# extract surnames
surnames = [laureate["surname"] for laureate in sorted_laureates]
# concatenate surnames separated with " and "
all_names = " and ".join(surnames)
return all_names
# find physics prizes, project year and name, and sort by year
docs = db.prizes.find(
filter= {"category": "physics"},
projection= ["year", "laureates.firstname", "laureates.surname"],
sort= [("year", 1)])
# print the year and laureate names (from all_laureates)
for doc in docs:
print("{year}: {names}".format(year=doc["year"], names=all_laureates(doc)))
### Gap years
# original categories from 1901
original_categories = db.prizes.distinct("category", {"year": "1901"})
print(original_categories)
# project year and category, and sort
docs = db.prizes.find(
filter={},
projection = {"year":1, "category":1, "_id":0},
sort=[("year", -1), ("category", 1)]
)
#print the documents
for doc in docs:
print(doc)
### Recently single?
# Specify an index model for compound sorting
index_model = [("category", 1), ("year", -1)]
db.prizes.create_index(index_model)
# Collect the last single-laureate year for each category
report = ""
for category in sorted(db.prizes.distinct("category")):
doc = db.prizes.find_one(
{"category": category, "laureates.share": "1"},
sort=[("year", -1)]
)
report += "{category}: {year}\n".format(**doc)
print(report)
### Born and affiliated
from collections import Counter
# Ensure an index on country of birth
db.laureates.create_index([("bornCountry", 1)])
# Collect a count of laureates for each country of birth
n_born_and_affiliated = {
country: db.laureates.count_documents({
"bornCountry": country,
"prizes.affiliations.country": country
})
for country in db.laureates.distinct("bornCountry")
}
five_most_common = Counter(n_born_and_affiliated).most_common(5)
print(five_most_common)
### The first five prizes with quarter shares
from pprint import pprint
# Fetch prizes with quarter-share laureate(s)
filter_ = {"laureates.share": "4"}
# Save the list of field names
projection = ["category", "year", "laureates.motivation"]
# Save a cursor to yield the first five prizes
cursor = db.prizes.find(filter_, projection).sort("year").limit(5)
pprint(list(cursor))
### Pages of particle-prized people
from pprint import pprint
# Write a function to retrieve a page of data
def get_particle_laureates(page_number=1, page_size=3):
if page_number < 1 or not isinstance(page_number, int):
raise ValueError("Pages are natural numbers (starting from 1).")
particle_laureates = list(
db.laureates.find(
{"prizes.motivation": {"$regex": "particle"}},
["firstname", "surname", "prizes"])
.sort([("prizes.year", 1), ("surname", 1)])
.skip(page_size * (page_number - 1))
.limit(page_size))
return particle_laureates
# Collect and save the first nine pages
pages = [get_particle_laureates(page_number=page) for page in range(1,9)]
pprint(pages[0])
#######################################################
# Aggregation Pipelines: Let the Server Do It For You #
#######################################################
### Aggregating a few individuals' country data
# Translate cursor to aggregation pipeline
pipeline = [
{"$match": {"gender": {"$ne": "org"}}},
{"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
{"$limit": 3}
]
for doc in db.laureates.aggregate(pipeline):
print("{bornCountry}: {prizes}".format(**doc))
### Passing the aggregation baton to Python
from collections import OrderedDict
from itertools import groupby
from operator import itemgetter
original_categories = set(db.prizes.distinct("category", {"year": "1901"}))
# Save an pipeline to collect original-category prizes
pipeline = [
{"$match": {"category": {"$in": list(original_categories)}}},
{"$project": {"year": 1, "category": 1}},
{"$sort": OrderedDict([("year", -1)])}
]
cursor = db.prizes.aggregate(pipeline)
for key, group in groupby(cursor, key=itemgetter("year")):
missing = original_categories - {doc["category"] for doc in group}
if missing:
print("{year}: {missing}".format(year=key, missing=", ".join(sorted(missing))))
### Organizing prizes
# Count prizes awarded (at least partly) to organizations as a sum over sizes of "prizes" arrays.
pipeline = [
{"$match": {"gender": "org"}},
{"$project": {"n_prizes": {"$size": "$prizes"}}},
{"$group": {"_id": None, "n_prizes_total": {"$sum": {"_id": None}}}}
]
print(list(db.laureates.aggregate(pipeline)))
### Gap years, aggregated
from collections import OrderedDict
original_categories = sorted(set(db.prizes.distinct("category", {"year": "1901"})))
pipeline = [
{"$match": {"category": {"$in": original_categories}}},
{"$project": {"category": 1, "year": 1}},
# Collect the set of category values for each prize year.
{"$group": {"_id": "$year", "categories": {"$addToSet": "$category"}}},
# Project categories *not* awarded (i.e., that are missing this year).
{"$project": {"missing": {"$setDifference": [original_categories, "$categories"]}}},
# Only include years with at least one missing category
{"$match": {"missing.0": {"$exists": 1}}},
# Sort in reverse chronological order. Note that "_id" is a distinct year at this stage.
{"$sort": OrderedDict([("_id", -1)])},
]
for doc in db.prizes.aggregate(pipeline):
print("{year}: {missing}".format(year=doc["_id"],missing=", ".join(sorted(doc["missing"]))))
### Here and elsewhere
key_ac = "prizes.affiliations.country"
key_bc = "bornCountry"
pipeline = [
{"$project": {key_bc: 1, key_ac: 1}},
# Ensure a single prize affiliation country per pipeline document
{"$unwind": "$prizes"},
{"$unwind": "$prizes.affiliations"},
# Ensure values in the list of distinct values (so not empty)
{"$match": {key_ac: {"$in": db.laureates.distinct(key_ac)}}},
{"$project": {"affilCountrySameAsBorn": {
"$gte": [{"$indexOfBytes": ["$"+key_ac, "$"+key_bc]}, 0]}}},
# Count by "$affilCountrySameAsBorn" value (True or False)
{"$group": {"_id": "$affilCountrySameAsBorn",
"count": {"$sum": 1}}},
]
for doc in db.laureates.aggregate(pipeline): print(doc)
### Countries of birth by prize category
pipeline = [
# Unwind the laureates array
{"$unwind": "$laureates"},
{"$lookup": {
"from": "laureates", "foreignField": "id",
"localField": "laureates.id", "as": "laureate_bios"}},
# Unwind the new laureate_bios array
{"$unwind": "$laureate_bios"},
{"$project": {"category": 1,
"bornCountry": "$laureate_bios.bornCountry"}},
# Collect bornCountry values associated with each prize category
{"$group": {"_id": "$category",
"bornCountries": {"$addToSet": "$bornCountry"}}},
# Project out the size of each category's (set of) bornCountries
{"$project": {"category": 1,
"nBornCountries": {"$size": "$bornCountries"}}},
{"$sort": {"nBornCountries": -1}},
]
for doc in db.prizes.aggregate(pipeline): print(doc)
### How many prizes were awarded to immigrants?
pipeline = [
# Limit results to people; project needed fields; unwind prizes
{"$match": {"gender": {"$ne": "org"}}},
{"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
{"$unwind": "$prizes"},
# Count prizes with no country-of-birth affiliation
{"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
{"$match": {"bornCountryInAffiliations": False}},
{"$count": "awardedElsewhere"},
]
print(list(db.laureates.aggregate(pipeline)))
### Refinement: filter out "unaffiliated" people
pipeline = [
{"$match": {"gender": {"$ne": "org"}}},
{"$project": {"bornCountry": 1, "prizes.affiliations.country": 1}},
{"$unwind": "$prizes"},
{"$addFields": {"bornCountryInAffiliations": {"$in": ["$bornCountry", "$prizes.affiliations.country"]}}},
{"$match": {"bornCountryInAffiliations": False}},
{"$count": "awardedElsewhere"},
]
# Construct the additional filter stage
added_stage = {"$match": {"prizes.affiliations.country": {"$in": db.laureates.distinct("prizes.affiliations.country")}}}
# Insert this stage into the pipeline
pipeline.insert(3, added_stage)
print(list(db.laureates.aggregate(pipeline)))