#!/usr/bin/env python3
"""Sync embargoed SPARC datasets from Blackfynn into MongoDB."""

from blackfynn import Blackfynn
from pymongo import MongoClient

from app.config import Config

bf = None
embargoed = None


def connect_blackfynn():
    "Connect to Blackfynn"
    print("Connecting to Blackfynn ...", end=" ")
    bf = Blackfynn(
        api_token=Config.BLACKFYNN_API_TOKEN,
        api_secret=Config.BLACKFYNN_API_SECRET,
        env_override=False,
        host=Config.BLACKFYNN_API_HOST,
    )
    print("done")
    return bf


def connect_mongo():
    "Connect to MongoDB"
    print("Connecting to MongoDB ...", end=" ")
    client = MongoClient(Config.MONGODB_URI)
    db = client[Config.MONGODB_NAME]
    collection = db[Config.MONGODB_COLLECTION]
    print("done")
    return collection


def transform(ds):
    "Convert dataset JSON from Blackfynn to a database entry"
    # Relies on the module-level `api` and `org` objects set up in __main__.
    content = ds["content"]
    # models = bf.get_dataset(content['id']).models()
    doc = {
        "_id": content["intId"],
        "name": content["name"],
        "description": content.get("description"),
        "createdAt": content["createdAt"],
        "updatedAt": content["updatedAt"],
        "tags": content["tags"],
        "contributors": content["contributors"],
        "userId": ds["owner"],
        "size": ds["storage"],
        "organization": org["organization"]["name"],
        "organizationId": org["organization"]["intId"],
        # 'modelCount': {name: m.count for name,m in models.items()},
        # 'fileCount': Counter(),
        # 'recordCount': sum(m.count for m in models.values()),
        "banner": api._get(
            api._uri("/datasets/{dsid}/banner", dsid=content["id"])
        ).get("banner", None),
    }
    # Get file counts:
    # cursor = ''
    # while True:
    #     resp = api._get(api._uri('/datasets/{dsid}/packages?cursor={cursor}',
    #                              dsid=content['id'], cursor=cursor))
    #     for p in resp['packages']:
    #         ptype = p['content']['packageType']
    #         doc['fileCount'][ptype] += 1
    #     cursor = resp.get('cursor')
    #     if cursor is None:
    #         break
    return doc


if __name__ == "__main__":
    print("Starting embargoed data sync")
    bf = connect_blackfynn()
    api = bf._api.core
    embargoed = connect_mongo()

    org = api._get(api._uri("/organizations/{orgid}", orgid=bf.context.id))
    # Assuming all of these belong to the SPARC Consortium organization.
    all_datasets = api._get("/datasets")
    publishedIds = [x["sourceDatasetId"] for x in api._get("/datasets/published")]

    # Delete all existing entries:
    embargoed.drop()

    entries = []
    for ds in all_datasets:
        # Skip if the dataset is published or isn't shared with the Embargoed Data team:
        teams = api._get(
            api._uri("/datasets/{dsid}/collaborators/teams", dsid=ds["content"]["id"])
        )
        if ds["content"]["intId"] in publishedIds or not any(
            t["id"] == Config.PENNSIEVE_EMBARGO_TEAM_ID for t in teams
        ):
            print("skipping")
            continue
        entry = transform(ds)
        entries.append(entry)
        print("Found:", entry["_id"], entry["name"], sep="\t")

    if entries:
        embargoed.insert_many(entries)
    print("Sync finished.")