"""Cross-check Biolucida image references between SciCrunch, Biolucida and Pennsieve.

For every dataset returned by the SciCrunch search endpoint, each object that
carries a Biolucida identifier is verified against the Biolucida image
endpoint and against the Pennsieve file listing of the object's folder.
A JSON report is written and the test fails when any dataset has errors.
"""
import json
import os
import unittest
import urllib.parse
from urllib.parse import urljoin

import requests

from tests.config import Config

# Pennsieve folder listings, keyed by folder path. Emptied at the start of
# every dataset by test_biolucida_list().
pennsieveCache = {}
doc_link = 'https://github.com/ABI-Software/scicrunch-knowledge-testing/tree/doc_v1'
S3_BUCKET_NAME = "pennsieve-prod-discover-publish-use1"
NOT_SPECIFIED = 'not-specified'
# Seconds to wait on any single HTTP call; without a timeout `requests`
# blocks forever on an unresponsive server.
REQUEST_TIMEOUT = 60

# NOTE(review): test_biolucida_list and test_datasets_information are helpers,
# not test cases; their `test_` prefix is kept for backward compatibility but a
# prefix-based collector (e.g. pytest defaults) may try to collect them.


def getDatasets(start, size):
    """Request one page of datasets from the SciCrunch search endpoint.

    Args:
        start: Offset of the first hit to return (sent as "from").
        size: Maximum number of hits in the page.

    Returns:
        The ``requests.Response`` of the POST; the caller checks the status.
    """
    headers = {'accept': 'application/json'}
    params = {'api_key': Config.SCICRUNCH_API_KEY}
    scicrunch_host = Config.SCICRUNCH_API_HOST + '/'
    scicrunch_request = {
        "from": start,
        "size": size,
        "_source": [
            "item.curie",
            "item.name",
            "item.types",
            "objects.biolucida",
            "objects.additional_mimetype",
            "objects.dataset",
            "pennsieve.version",
            "pennsieve.identifier",
            "pennsieve.uri",
        ],
    }
    return requests.post(
        urljoin(scicrunch_host, '_search?preference=abiknowledgetesting'),
        json=scicrunch_request,
        params=params,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )


def extract_bucket_name(original_name):
    """Return the third '/'-separated component of *original_name*.

    For a URI of the form ``scheme://bucket/...`` that component is the bucket.
    Raises ``IndexError`` when the string has fewer than three components.
    """
    return original_name.split('/')[2]


def _object_error(path, biolucida_id, reason):
    """Build one per-object error entry for the report."""
    return {
        'Path': path,
        'biolucida_id': biolucida_id,
        'Reason': reason,
    }


def _pennsieve_folder_files(id, version, folder_path):
    """Return the Pennsieve file entries of *folder_path*, using pennsieveCache.

    Only non-empty listings are cached, so a folder that was not found is
    requested again on the next lookup (same as before).
    """
    if folder_path in pennsieveCache:
        return pennsieveCache[folder_path]
    # Percent-encode the path so characters such as '&', '#' or '+' in a
    # folder name cannot alter the query string; '/' is left untouched.
    file_url = '{api}/datasets/{id}/versions/{version}/files/browse?path={folderPath}'.format(
        api=Config.PENNSIEVE_API_HOST,
        id=id,
        version=version,
        folderPath=urllib.parse.quote(folder_path),
    )
    files_info = requests.get(file_url, timeout=REQUEST_TIMEOUT).json()
    files = files_info['files'] if 'files' in files_info else []
    if files:
        pennsieveCache[folder_path] = files
    return files


def _listing_contains(files, file_path):
    """Tell whether *file_path* matches an entry of a Pennsieve listing.

    Matching is case-insensitive: either the entry's 'path' equals the file
    path, or the folder part of the entry's 'uri' occurs inside the file path.
    """
    wanted = file_path.lower()
    for entry in files:
        if wanted == entry['path'].lower():
            return True
        if 'uri' in entry:
            uri_folder = entry['uri'].rsplit("/", 1)[0].lower()
            # Both sides are lower-cased; previously only the URI was, which
            # made this fallback case-sensitive on the file path.
            if uri_folder and uri_folder in wanted:
                return True
    return False


# Test object to check for any possible error
def testBiolucida(id, version, obj, biolucida_id, bucket):
    """Check one SciCrunch object against Biolucida and Pennsieve.

    Args:
        id: Pennsieve dataset identifier.
        version: Pennsieve dataset version identifier.
        obj: SciCrunch object; ``obj['dataset']['path']`` is read.
        biolucida_id: Biolucida image identifier of the object.
        bucket: Unused; kept so existing callers keep working.

    Returns:
        ``None`` when everything is consistent, otherwise a dict with the keys
        'Path', 'biolucida_id' and 'Reason'. Any exception raised while
        checking is reported as the 'Reason' instead of propagating.
    """
    localPath = obj['dataset']['path']
    try:
        biolucida_response = requests.get(
            f'{Config.BIOLUCIDA_ENDPOINT}/image/{biolucida_id}',
            timeout=REQUEST_TIMEOUT,
        )
        if biolucida_response.status_code != 200:
            return _object_error(
                localPath, biolucida_id, 'Cannot get a valid request from Biolucida')

        image_info = biolucida_response.json()
        if image_info['status'] == "permission denied":
            return _object_error(localPath, biolucida_id, 'Biolucida permission denied')

        # Check if file path is consistent between scicrunch and biolucida
        if 'name' not in image_info or image_info['name'] not in localPath:
            return _object_error(
                localPath, biolucida_id, 'Conflict between biolucida and scicrunch response')

        # Now check if the file path is consistent between Pennsieve and
        # the other two.
        file_path = "files/" + localPath
        folder_path = file_path.rsplit("/", 1)[0]
        files = _pennsieve_folder_files(id, version, folder_path)
        if not files:
            return _object_error(
                localPath, biolucida_id, 'Folder path cannot be found on Pennsieve')
        if not _listing_contains(files, file_path):
            return _object_error(
                localPath, biolucida_id, 'File path cannot be found on Pennsieve')
    except Exception as e:
        # Best effort: one broken object must not abort the whole report.
        return _object_error(localPath, biolucida_id, str(e))
    return None


def test_biolucida_list(id, version, obj_list, bucket):
    """Check every Biolucida-tagged object of one dataset.

    Args:
        id: Pennsieve dataset identifier.
        version: Pennsieve dataset version identifier.
        obj_list: SciCrunch objects of the dataset.
        bucket: Forwarded to testBiolucida().

    Returns:
        A dict with 'FileReports' (error count and per-object errors),
        'DatasetErrors' (mismatches between SciCrunch and the Biolucida
        dataset lookup) and 'BiolucidaFound' (True when either source
        mentions Biolucida for this dataset).
    """
    # Folder listings are keyed by path only, so they must not leak from one
    # dataset to the next.
    pennsieveCache.clear()

    object_errors = []
    dataset_errors = []
    biolucida_id_found = False
    biolucida_info_found = False

    biolucida_response = requests.get(
        f'{Config.BIOLUCIDA_ENDPOINT}/imagemap/search_dataset/discover/{id}',
        timeout=REQUEST_TIMEOUT,
    )
    if biolucida_response.status_code == 200:
        dataset_info = biolucida_response.json()
        if 'status' in dataset_info and dataset_info['status'] == "success":
            biolucida_info_found = True

    for obj in obj_list:
        biolucida = obj.get('biolucida', NOT_SPECIFIED)
        if biolucida == NOT_SPECIFIED:
            continue
        biolucida_id = biolucida.get('identifier')
        if not biolucida_id:
            continue
        biolucida_id_found = True
        error = testBiolucida(id, version, obj, biolucida_id, bucket)
        if error:
            object_errors.append(error)

    if biolucida_id_found and not biolucida_info_found:
        dataset_errors.append({
            'Reason': 'One or more Biolucida ID found in SciCrunch but no image information is found on biolucida server.'
        })
    if not biolucida_id_found and biolucida_info_found:
        dataset_errors.append({
            'Reason': 'Image information found on Biolucida server but no image id is found on SciCrunch.'
        })

    file_reports = {
        'Total': len(object_errors),
        'Objects': object_errors,
    }
    return {
        "FileReports": file_reports,
        "DatasetErrors": dataset_errors,
        "BiolucidaFound": biolucida_id_found or biolucida_info_found,
    }


# Test the dataset
def test_datasets_information(dataset):
    """Build the report entry for one SciCrunch search hit.

    Args:
        dataset: One element of ``hits.hits``; '_id' is required, '_source'
            is optional.

    Returns:
        A dict with 'Id', 'DOI', '_id', 'Errors' and 'ObjectErrors', plus
        'Name', 'Version' and 'Biolucida' when the corresponding data exists.
    """
    report = {
        'Id': 'none',
        'DOI': 'none',
        '_id': dataset['_id'],
        'Errors': [],
        'ObjectErrors': {'Total': 0, 'Objects': []},
    }
    if '_source' not in dataset:
        return report
    source = dataset['_source']

    if 'item' in source:
        report['Name'] = source['item'].get('name', 'none')
        report['DOI'] = source['item'].get('curie', 'none')

    if 'pennsieve' in source and 'version' in source['pennsieve'] and 'identifier' in source['pennsieve']:
        pennsieve = source['pennsieve']
        dataset_id = pennsieve['identifier']
        # A version record without an identifier is reported as
        # 'Missing version' below instead of raising KeyError.
        version = pennsieve['version'].get('identifier')
        report['Id'] = dataset_id
        report['Version'] = version
        bucket = S3_BUCKET_NAME
        if 'uri' in pennsieve:
            bucket = extract_bucket_name(pennsieve['uri'])
        if version:
            if 'objects' in source:
                obj_reports = test_biolucida_list(dataset_id, version, source['objects'], bucket)
                report['ObjectErrors'] = obj_reports['FileReports']
                report['Errors'].extend(obj_reports["DatasetErrors"])
                report['Biolucida'] = obj_reports['BiolucidaFound']
        else:
            report['Errors'].append('Missing version')
    return report


class BiolucidaDatasetFilesTest(unittest.TestCase):
    """Runs the Biolucida consistency check over the SciCrunch datasets."""

    def test_files_information(self):
        """Page through SciCrunch, write the JSON report, fail on any error."""
        start = 0
        size = 20
        keepGoing = True
        totalSize = 0
        reportOutput = 'reports/biolucida_reports.json'
        reports = {'Tested': 0, 'Failed': 0, 'FailedIds': [], 'Datasets': []}
        testSize = 2000  # upper bound on the number of datasets examined
        totalBiolucida = 0

        while keepGoing:
            scicrunch_response = getDatasets(start, size)
            self.assertEqual(200, scicrunch_response.status_code)

            hits = scicrunch_response.json()['hits']['hits']
            # A short page means there are no more results: stop after it.
            if size > len(hits):
                keepGoing = False
            start = start + size

            for dataset in hits:
                report = test_datasets_information(dataset)
                if report.get('Biolucida'):
                    totalBiolucida = totalBiolucida + 1
                print(f"Reports generated for {report['Id']}")
                if report['Errors'] or report['ObjectErrors']['Total'] > 0:
                    reports['FailedIds'].append(report['Id'])
                    reports['Datasets'].append(report)

            totalSize = totalSize + len(hits)
            if totalSize >= testSize:
                keepGoing = False

        # Generate the report
        reports['Tested'] = totalSize
        reports['Tested Datasets with Biolucida'] = totalBiolucida
        print(f"Number of datasets tested: {reports['Tested']}")
        reports['Failed'] = len(reports['FailedIds'])
        print(f"Number of dataset with erros: {reports['Failed']}")
        if reports['Failed'] > 0:
            print(f"Failed Datasets: {reports['FailedIds']}")

        os.makedirs(os.path.dirname(reportOutput), exist_ok=True)
        with open(reportOutput, 'w') as outfile:
            json.dump(reports, outfile, indent=4)

        print(f"Full report has been generated at {reportOutput}")

        self.assertEqual(0, len(reports['FailedIds']))


if __name__ == '__main__':
    unittest.main()