"""Integration tests for the SPARC API's SciCrunch-backed endpoints.

These tests run against a live Flask test client (and, in one case, the
SciCrunch service directly), so they require the project configuration
(`Config.SCI_CRUNCH_HOST`, `Config.KNOWLEDGEBASE_KEY`) to be valid.
"""
import json
import os
import re

import pytest
import requests
from packaging import version

from app import app
from app.main import dataset_search
from app.scicrunch_requests import create_query_string
from app.config import Config
from known_uberons import UBERONS_DICT
from known_dois import has_doi_changed, warn_doi_changes


@pytest.fixture
def client():
    # Spin up test flask app
    app.config['TESTING'] = True
    return app.test_client()


def test_scicrunch_keys(client):
    r = client.get('/search/')
    assert r.status_code == 200
    assert 'numberOfHits' in json.loads(r.data)


def test_scicrunch_versions_are_supported():
    """Every dataset version reported by SciCrunch must have a matching
    scicrunch_processing_v_* module in the app package."""
    # Lines below are to allow the test to be run from the root dir or sparc-api/tests
    current_directory = os.path.dirname(os.path.abspath(__file__))
    app_directory = os.path.join(current_directory, '..', 'app')
    # List the contents of the 'app' directory, to find which versions we have files for
    available_versions = os.listdir(app_directory)
    r = requests.get(f'{Config.SCI_CRUNCH_HOST}/_search?api_key={Config.KNOWLEDGEBASE_KEY}&q=""')
    results = r.json()
    hits = results['hits']['hits']
    for hit in hits:
        try:
            # e.g. "1.1.5" -> "1.1.X": processing modules cover a patch series.
            # NOTE: named hit_version (not "version") to avoid shadowing the
            # packaging.version module imported above.
            hit_version = hit['_source']['item']['version']['keyword'][:-1] + 'X'
        except KeyError:
            # Try to get minimal information out from the datasets
            hit_version = 'undefined'
        package_version = f'scicrunch_processing_v_{hit_version.replace(".", "_")}.py'
        assert package_version in available_versions


def check_doi_status(client, dataset_id, doi):
    """Return True when the dataset's current DOI still matches *doi*.

    Looks the dataset up by Pennsieve identifier; for datasets at version
    1.1.4 or later, compares its DOI against the known value and emits a
    warning when they differ.  Callers skip their test on a False result.
    """
    r = client.get('/dataset_info/using_pennsieve_identifier', query_string={'identifier': dataset_id})
    response = json.loads(r.data)
    result = response['result'][0]
    status = True
    if version.parse(result['version']) >= version.parse("1.1.4"):
        if has_doi_changed(result['doi'].replace('https://doi.org/', ''), doi):
            warn_doi_changes()
            status = False
    return status


def test_scicrunch_dataset_doi(client):
    # Testing with dataset 55
    identifier = "55"
    run_doi_test = check_doi_status(client, identifier, '10.26275/pzek-91wx')
    if run_doi_test:
        r = client.get('/scicrunch-dataset/DOI%3A10.26275%2Fpzek-91wx')
        # Parse the response once instead of re-loading it per assertion.
        hit = json.loads(r.data)['hits']['hits'][0]
        dataset_version = hit['_source']['item']['version']['keyword']
        if version.parse(dataset_version) >= version.parse("1.1.4"):
            assert hit['_id'] == "55"
            assert hit['_source']['item']['curie'] == "DOI:10.26275/pzek-91wx"
        else:
            assert hit['_id'] == "DOI:10.26275/pzek-91wx"
    else:
        pytest.skip('DOI used in test is out of date.')


def test_scicrunch_multiple_dataset_doi(client):
    # Testing with dataset 55 and 68
    run_doi_test_1 = check_doi_status(client, "55", '10.26275/pzek-91wx')
    run_doi_test_2 = check_doi_status(client, "68", '10.26275/4qvr-kwzq')
    if run_doi_test_1 and run_doi_test_2:
        r = client.get('/dataset_info/using_multiple_dois/?dois=10.26275%2Fpzek-91wx&dois=10.26275%2F4qvr-kwzq')
        results = json.loads(r.data)['results']
        dataset_version = results[0]['version']
        if version.parse(dataset_version) >= version.parse("1.1.4"):
            # Order of results is not guaranteed; both ids must be one of the two.
            discover_id_1 = results[0]['dataset_identifier']
            discover_id_2 = results[1]['dataset_identifier']
            assert discover_id_1 in ("55", "68")
            assert discover_id_2 in ("55", "68")
    else:
        pytest.skip('DOI used in test is out of date.')


def test_scicrunch_multiple_dataset_ids(client):
    # Testing with dataset 55 and 68
    r = client.get('/dataset_info/using_multiple_discoverIds/?discoverIds=55&discoverIds=68')
    results = json.loads(r.data)['results']
    dataset_version = results[0]['version']
    if version.parse(dataset_version) >= version.parse("1.1.4"):
        discover_id_1 = results[0]['dataset_identifier']
        discover_id_2 = results[1]['dataset_identifier']
        assert discover_id_1 in ("55", "68")
        assert discover_id_2 in ("55", "68")


def test_scicrunch_search(client):
    r = client.get('/search/heart')
    assert r.status_code == 200
    assert json.loads(r.data)['numberOfHits'] > 4


def test_scicrunch_all_data(client):
    r = client.get('/filter-search/')
    assert json.loads(r.data)['numberOfHits'] > 40


def test_scicrunch_filter(client):
    r = client.get('/filter-search/', query_string={'term': 'organ', 'facet': 'heart'})
    assert json.loads(r.data)['numberOfHits'] > 4


def test_scicrunch_filter_scaffolds(client):
    r = client.get('/filter-search/?facet=scaffolds&term=datasets')
    assert json.loads(r.data)['numberOfHits'] > 10


def test_scicrunch_basic_search(client):
    r = client.get('/filter-search/Heart/?facet=All+Species&term=species')
    assert json.loads(r.data)['numberOfHits'] > 10


def test_scicrunch_image_search(client):
    r = client.get('/multiple_dataset_info/using_multiple_mimetype/?q="*jp2*+OR+*vnd.ome.xml*+OR+*jpx*"')
    assert json.loads(r.data)['numberOfHits'] > 5


def test_scicrunch_boolean_logic(client):
    r = client.get('/filter-search/?facet=All+Species&term=species&facet=male&term=gender&facet=female&term=gender')
    assert json.loads(r.data)['numberOfHits'] > 20


def test_scicrunch_combined_facet_text(client):
    r = client.get('/filter-search/heart/?facet=All+Species&term=species&facet=male&term=gender&facet=female&term=gender')
    assert json.loads(r.data)['numberOfHits'] > 1


def test_getting_facets(client):
    r = client.get('/get-facets/organ')
    facet_results = json.loads(r.data)
    facets = [facet_result['key'] for facet_result in facet_results]
    assert 'heart' in facets


def test_create_identifier_query(client):
    r = client.get('/dataset_info/using_object_identifier?identifier=package:e6435710-dd9c-46b7-9dfd-932103469733')
    json_data = json.loads(r.data)
    assert 'result' in json_data
    results = json_data['result']
    assert len(results) == 1
    result = results[0]
    assert 'version' in result
    assert result['version'] == '1.2.1'
    assert 'title' in result
    assert result['title'] == 'Morphometric analysis of the abdominal vagus nerve in rats'


def test_create_anatomy_query(client):
    r = client.get('/dataset_info/anatomy?identifier=90')
    json_data = json.loads(r.data)
    assert 'result' in json_data
    results = json_data['result']
    assert len(results) == 1
    result = results[0]
    assert 'anatomy' in result
    assert 'organ' in result['anatomy']
    assert len(result['anatomy']['organ']) == 1
    assert 'curie' in result['anatomy']['organ'][0]
    assert result['anatomy']['organ'][0]['curie'] == 'UBERON:0001759'
    assert 'item' in result
    assert 'curie' in result['item']
    assert result['item']['curie'].startswith("DOI:")
    assert 'organisms' in result
    assert 'subject' in result['organisms']
    assert len(result['organisms']['subject']) == 1
    assert 'species' in result['organisms']['subject'][0]
    assert 'curie' in result['organisms']['subject'][0]['species']
    assert result['organisms']['subject'][0]['species']['curie'] == 'NCBITaxon:10116'


def test_response_version(client):
    # Testing with dataset 44
    identifier = "44"
    doi = "10.26275/duz8-mq3n"
    run_doi_test = check_doi_status(client, identifier, doi)
    if run_doi_test:
        r = client.get('/dataset_info/using_doi', query_string={'doi': doi})
        data = r.data.decode('utf-8')
        json_data = json.loads(data)
        assert len(json_data['result']) == 1
        assert 'version' in json_data['result'][0]
    else:
        pytest.skip('DOI used in test is out of date.')


def _assert_listed_files_exist(client, path_prefix, entries, relation):
    """Assert each datacite-linked file path in *entries* exists.

    *relation* is the datacite key holding the path list (e.g. 'isSourceOf');
    each non-empty path is checked through the /exists/{path} route.
    """
    for entry in entries:
        for path in entry['datacite'][relation]['path']:
            if path:
                full_path = '/'.join((path_prefix, path))
                r = client.get(full_path)
                json_data = json.loads(r.data.decode('utf-8'))
                print(full_path)
                assert json_data['exists'] == 'true'


def test_response_abi_plot(client):
    # Testing abi-plot with dataset 212
    identifier = "212"
    doi = "10.26275/lok5-wje6"
    run_doi_test = check_doi_status(client, identifier, doi)
    if run_doi_test:
        r = client.get('/dataset_info/using_doi', query_string={'doi': doi})
        data = r.data.decode('utf-8')
        json_data = json.loads(data)
        assert len(json_data['result']) == 1
        if json_data['result'][0]['version'] == '1.1.5':
            assert len(json_data['result'][0]['abi-plot']) == 4
            identifier = json_data['result'][0]["dataset_identifier"]
            # Named dataset_version (not "version") so the packaging.version
            # module is not shadowed; matches test_response_abi_scaffold.
            dataset_version = json_data['result'][0]["dataset_version"]
            assert identifier == "212"
            assert dataset_version == "1"
            # Construct the file path prefix, it should be /exists/212/1/files
            path_prefix = '/'.join(('', 'exists', identifier, dataset_version, 'files'))
            _assert_listed_files_exist(client, path_prefix, json_data['result'][0]['abi-plot'], 'isDescribedBy')
        else:
            pytest.skip('Only test abi-plot against version 1.1.5.')
    else:
        pytest.skip('DOI used in test is out of date.')


def test_response_abi_scaffold(client):
    # Testing abi-scaffold with dataset 76
    identifier = "76"
    doi = "10.26275/jarb-s8jw"
    run_doi_test = check_doi_status(client, identifier, doi)
    if run_doi_test:
        r = client.get('/dataset_info/using_doi', query_string={'doi': doi})
        data = r.data.decode('utf-8')
        json_data = json.loads(data)
        if len(json_data['result']) == 1:
            if json_data['result'][0]['version'] == '1.1.5':
                identifier = json_data['result'][0]["dataset_identifier"]
                dataset_version = json_data['result'][0]["dataset_version"]
                assert identifier == "76"
                assert dataset_version == "4"
                # Construct the file path prefix, it should be /exists/76/4/files
                path_prefix = '/'.join(('', 'exists', identifier, dataset_version, 'files'))
                result = json_data['result'][0]
                assert len(result['abi-scaffold-metadata-file']) == 1
                _assert_listed_files_exist(client, path_prefix, result['abi-scaffold-metadata-file'], 'isSourceOf')
                assert len(result['abi-scaffold-view-file']) == 4
                _assert_listed_files_exist(client, path_prefix, result['abi-scaffold-view-file'], 'isSourceOf')
                assert len(result['abi-scaffold-thumbnail']) == 4
                _assert_listed_files_exist(client, path_prefix, result['abi-scaffold-thumbnail'], 'isDerivedFrom')
            else:
                pytest.skip('Only test abi-plot against version 1.1.5.')
    else:
        pytest.skip('DOI used in test is out of date.')


def test_response_sample_subject_size(client):
    # Only filter search returns the sampleSize and subjectSize
    r = client.get('/filter-search/?facet=liver&term=organ')
    data = r.data.decode('utf-8')
    json_data = json.loads(data)
    print(json_data)
    assert len(json_data['results']) == 1
    assert json_data['results'][0]['sampleSize'] == '9'
    assert json_data['results'][0]['subjectSize'] == '9'


# Expected shape of a single '_source' document returned by SciCrunch.
# Structure grammar: {'type': dict, 'required': [...], 'optional': [...]}
# where each entry is either a plain key name or a {key: sub-structure} dict;
# list structures carry an 'item' sub-structure instead.
source_structure = {
    'type': dict,
    'required': ['contributors', 'dataItem', {'item': {
        'type': dict,
        'required': ['types', 'contentTypes', 'statistics', 'keywords', 'published', 'description',
                     'name', 'identifier', 'docid', 'curie'],
        'optional': [{'version': {'type': dict, 'required': ['keyword'], 'optional': []}},
                     'techniques', 'readme', 'modalities', 'names']
    }}, 'pennsieve', 'provenance', 'supportingAwards'],
    'optional': ['anatomy', 'attributes', 'dates', 'diseases', 'distributions', {'objects': {
        'type': list,
        'item': {
            'type': dict,
            'required': ['bytes', 'dataset', 'identifier', 'mimetype', 'name', 'updated'],
            'optional': ['distributions']}
    }}, 'organisms', 'protocols', 'publication', 'xrefs']
}

# Expected shape of the whole raw SciCrunch search response.
raw_structure_base = {
    'type': dict,
    'required': [
        {'hits': {
            'type': dict,
            'required': [
                {'hits': {'type': list, 'item': {
                    'type': dict,
                    'required': ['_index', '_type', '_id', '_score', {'_source': source_structure}],
                    'optional': ['_ignored']}
                }}
            ],
            'optional': [],
        }}
    ],
    'optional': []
}


class StructureDefinitionError(Exception):
    """Raised when a structure definition entry is malformed
    (a dict entry must contain exactly one key)."""
    pass


def _test_sub_structure(data, structure, required=True):
    """Check *data* against one 'required'/'optional' entry list.

    Entries are either bare key names (presence check only) or single-key
    dicts mapping a key to a nested structure to validate recursively.
    Returns False on the first mismatch, printing a short diagnostic.
    """
    for st in structure:
        if isinstance(st, str):
            if required and st not in data:
                print(f'failed: {st}')
                return False
            continue

        # A dict entry should have exactly one key.
        if not len(st.keys()) == 1:
            raise StructureDefinitionError

        key = next(iter(st))
        if required and key not in data:
            print(f'key failed: {key}')
            return False
        if key in data and not _test_structure(data[key], st[key]):
            print(f'structure failed: {key} - {st[key]["type"]}, {type(data[key])} - {st[key]} - {len(data[key])}')
            return False

    return True


def _test_structure(data, structure):
    """Recursively validate *data* against a structure definition.

    Returns False when the top-level type mismatches or any nested
    required/optional entry fails; True otherwise.  Leaf (non-dict,
    non-list) structures pass on the isinstance check alone.
    """
    structure_type = structure['type']
    if isinstance(data, structure_type):
        if structure_type is dict:
            if not _test_sub_structure(data, structure['required'], required=True):
                return False
            if not _test_sub_structure(data, structure['optional'], required=False):
                return False
        elif structure_type is list:
            for list_item in data:
                if not _test_structure(list_item, structure['item']):
                    return False
        else:
            print('type if not dict or list', type(data))
        return True
    return False


def test_raw_response_structure(client):
    # Example DOIs exercised by this query: 10.26275/zdxd-84xz, 10.26275/duz8-mq3n
    query = create_query_string("computational")
    data = dataset_search(query)
    assert _test_structure(data, raw_structure_base)
    assert 'hits' in data
    assert 'hits' in data['hits']
    assert isinstance(data['hits']['hits'], list)
    for hit in data['hits']['hits']:
        if 'version' in hit['_source']['item']:
            print(hit['_source']['item']['version']['keyword'])
        else:
            print('no version')
    for hit in data['hits']['hits']:
        print(hit['_source'].keys())
    objects = data['hits']['hits'][0]['_source']['objects']
    for o in objects:
        # BUGFIX: the fallback for a missing 'mimetype' must be a dict, not a
        # string, or the chained .get() raises AttributeError.
        mimetype = o.get('mimetype', {}).get('name', 'no-name')
        if mimetype == 'image/png':
            print('.', end="")
    print()


def test_get_body_scaffold_info(client):
    r = client.get('/get_body_scaffold_info/human')
    result = json.loads(r.data)
    assert result['id'] == '307'
    assert result['path'] == 'derivative/human_body_metadata.json'
    assert result['contextinfo'] == 'derivative/human_body_metadata_context_info.json'
    assert 'prd-sparc-discover50-use1' in result['s3uri']


def test_getting_curies(client):
    # Test if we get a shorter list of uberons with species specified
    r = client.get('/get-organ-curies/')
    uberons_results = json.loads(r.data)
    total = len(uberons_results['uberon']['array'])
    assert total > 0
    r = client.get('/get-organ-curies/?species=human')
    uberons_results = json.loads(r.data)
    human = len(uberons_results['uberon']['array'])
    assert total > human
    # Test if the uberon - name match the one from the hardcoded list
    for item in uberons_results['uberon']['array']:
        if item['id'] in UBERONS_DICT:
            assert UBERONS_DICT[item['id']] == item['name'].lower()


def test_get_related_terms(client):
    # Test if we can get the uberon term of heart using the uberon term
    # of left ventricle
    r = client.get('/get-related-terms/UBERON:0002084')
    uberons_results = json.loads(r.data)
    print(uberons_results)
    total = len(uberons_results['uberon']['array'])
    assert total > 0
    found_heart = any(
        item['id'] == 'UBERON:0000948' and item['name'] == 'heart'
        for item in uberons_results['uberon']['array']
    )
    assert found_heart


def test_scaffold_files(client):
    r = client.get('/filter-search/?size=30')
    results = json.loads(r.data)
    assert results['numberOfHits'] > 0
    for item in results['results']:
        if 'abi-scaffold-metadata-file' in item and 's3uri' in item:
            uri = item['s3uri']
            path = item['abi-scaffold-metadata-file'][0]['dataset']['path']
            # Strip the "s3://bucket/" prefix to get the object key.
            key = re.sub(r"s3://[^/]*/", "", f"{uri}files/{path}")
            # Keep only the bucket name from the s3 URI.
            s3_bucket_name = re.sub(r"s3://|/.*", "", uri)
            r = client.get(f"/exists/{key}?s3BucketName={s3_bucket_name}")
            data = r.data.decode('utf-8')
            json_data = json.loads(data)
            assert json_data['exists'] == 'true'


def test_finding_contextual_information(client):
    r = client.get('/dataset_info/using_multiple_discoverIds/?discoverIds=76')
    results = json.loads(r.data)
    assert results['numberOfHits'] > 0
    # Test we could find the generic colon scaffold dataset
    for item in results['results']:
        assert len(item['abi-contextual-information']) > 0  # Check it has contextual information


def test_undefined_version_dataset_search(client):
    # Testing with dataset 17 which is not versioned
    identifier = "17"
    doi = "10.26275/mlua-o9oj"
    r = client.get('/dataset_info/using_doi', query_string={'doi': doi})
    data = r.data.decode('utf-8')
    json_data = json.loads(data)
    assert len(json_data['result']) == 1
    assert 'dataset_identifier' in json_data['result'][0]
    assert json_data['result'][0]['dataset_identifier'] == '17'