import unittest
import requests
import botocore
import boto3
import json
import urllib.parse
import os
from urllib.parse import urljoin
from tests.config import Config

error_report = {}

doc_link = 'https://github.com/ABI-Software/scicrunch-knowledge-testing/tree/doc_v1'

# The following should either be a falsy value or a string containing a dataset number.
checkDatasetOnly = False

s3 = boto3.client(
    "s3",
    aws_access_key_id=Config.AWS_KEY,
    aws_secret_access_key=Config.AWS_SECRET,
    region_name="us-east-1",
)

S3_BUCKET_NAME = "pennsieve-prod-discover-publish-use1"

CONTEXT_FILE = 'abi-context-file'
PLOT_FILE = 'abi-plot'
SCAFFOLD_FILE = 'abi-scaffold-metadata-file'
SCAFFOLD_VIEW_FILE = 'abi-scaffold-view-file'
THUMBNAIL_IMAGE = 'abi-thumbnail'
NOT_SPECIFIED = 'not-specified'

MIMETYPE_WITH_THUMBNAILS = [PLOT_FILE, SCAFFOLD_FILE, SCAFFOLD_VIEW_FILE]

TEST_MIME_TYPES = {
    'application/x.vnd.abi.context-information+json': CONTEXT_FILE,
    'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE,
    'application/x.vnd.abi.scaffold.view+json': SCAFFOLD_VIEW_FILE,
    'image/x.vnd.abi.thumbnail+jpeg': THUMBNAIL_IMAGE,
    'inode/vnd.abi.scaffold+file': SCAFFOLD_FILE,
    'inode/vnd.abi.scaffold+thumbnail': THUMBNAIL_IMAGE,
    'inode/vnd.abi.scaffold.thumbnail+file': THUMBNAIL_IMAGE,
    'text/vnd.abi.plot+thumbnail': THUMBNAIL_IMAGE,
    'inode/vnd.abi.plot+thumbnail': THUMBNAIL_IMAGE,
    'inode/vnd.abi.scaffold.view+file': SCAFFOLD_VIEW_FILE,
    'text/vnd.abi.plot+tab-separated-values': PLOT_FILE,
    'text/vnd.abi.plot+csv': PLOT_FILE
}


def getDatasets(start, size):
    headers = {'accept': 'application/json'}
    params = {'api_key': Config.SCICRUNCH_API_KEY}
    scicrunch_host = Config.SCICRUNCH_API_HOST + '/'

    scicrunch_request = {
        "from": start,
        "size": size,
        "_source": [
            "item.curie",
            "item.name",
            "item.types",
            "objects.datacite",
            "objects.additional_mimetype",
            "objects.dataset",
            "pennsieve.version",
            "pennsieve.identifier",
            "pennsieve.uri"
        ]
    }

    # For checking a specific dataset only.
    if checkDatasetOnly:
        query = {
            "match": {
                "pennsieve.identifier.aggregate": {
                    "query": checkDatasetOnly
                }
            }
        }
        scicrunch_request["query"] = query

    return requests.post(urljoin(scicrunch_host, '_search?preference=abiknowledgetesting'),
                         json=scicrunch_request, params=params, headers=headers)


def extract_bucket_name(original_name):
    return original_name.split('/')[2]


def map_mime_type(mime_type):
    if mime_type == '':
        return NOT_SPECIFIED

    if mime_type == NOT_SPECIFIED:
        return NOT_SPECIFIED

    lower_mime_type = mime_type.lower()

    if lower_mime_type in TEST_MIME_TYPES:
        return TEST_MIME_TYPES[lower_mime_type]

    return NOT_SPECIFIED


# Get the file header response from the S3 bucket.
def getFileResponse(localPath, path, mime_type, bucket):
    try:
        head_response = s3.head_object(
            Bucket=bucket,
            Key=path,
            RequestPayer="requester"
        )
        if head_response and 'ResponseMetadata' in head_response \
                and 200 == head_response['ResponseMetadata']['HTTPStatusCode']:
            pass
        else:
            return {
                'Mimetype': mime_type,
                'Path': localPath,
                'Reason': 'Invalid response',
                'ReasonDetails': doc_link + '#reason-invalid-response'
            }
    except botocore.exceptions.ClientError as error:
        return {
            'Mimetype': mime_type,
            'Path': localPath,
            'Reason': f"{error}",
            'ReasonDetails': doc_link + '#reason-an-error-occurred-404-when-calling-the-headobject-operation-not-found'
        }
    return None


# Get the mimetype of an object, falling back to NOT_SPECIFIED.
def getObjectMimeType(obj):
    mime_type = obj.get('additional_mimetype', NOT_SPECIFIED)
    if mime_type != NOT_SPECIFIED:
        mime_type = mime_type.get('name')
    return mime_type
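
# For orientation, each entry in the `objects` list that the helpers below
# consume is expected to look roughly like the sketch here. The shape is
# inferred from the field accesses in this module; the values are
# illustrative only, not taken from a real dataset:
#
# {
#     "additional_mimetype": {"name": "application/x.vnd.abi.scaffold.meta+json"},
#     "dataset": {"path": "derivative/scaffold_metadata.json"},
#     "datacite": {
#         "isSourceOf": {"relative": {"path": ["scaffold_thumbnail.jpeg"]}},
#         "isDerivedFrom": {"relative": {"path": ["../primary/data.xlsx"]}}
#     }
# }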

# Check if any of the items in isSourceOf is a thumbnail for the object.
def checkForThumbnail(obj, obj_list):
    local_mapped_type = map_mime_type(getObjectMimeType(obj))
    if local_mapped_type == THUMBNAIL_IMAGE:
        # Thumbnail found.
        return True
    elif local_mapped_type == SCAFFOLD_VIEW_FILE:
        if 'dataset' in obj and 'path' in obj['dataset']:
            localPath = obj['dataset']['path']
            # Found a view file, check its sources for a thumbnail.
            if 'datacite' in obj and 'isSourceOf' in obj['datacite']:
                isSourceOf = obj['datacite']['isSourceOf']
                if 'relative' in isSourceOf and 'path' in isSourceOf['relative']:
                    for path in isSourceOf['relative']['path']:
                        actualPath = urllib.parse.urljoin(localPath, path)
                        found = next((i for i, item in enumerate(obj_list)
                                      if item['dataset']['path'] == actualPath), None)
                        # `found` may legitimately be index 0, so test against None,
                        # and only accept the source if it actually maps to a thumbnail.
                        if found is not None and \
                                map_mime_type(getObjectMimeType(obj_list[found])) == THUMBNAIL_IMAGE:
                            return True
    return False


# Generate a report for the datacite entries in the object.
def getDataciteReport(obj_list, obj, mapped_mimetype, filePath):
    keysToCheck = {'isDerivedFrom': 0, 'isSourceOf': 0}
    reports = {
        'TotalErrors': 0,
        'ThumbnailError': 'None',
        'ItemTested': 0,
        'isDerivedFrom': [],
        'isSourceOf': []
    }
    thumbnailFound = False

    if 'datacite' in obj:
        for key in keysToCheck:
            if key in obj['datacite']:
                keyObject = obj['datacite'][key]
                if 'relative' in keyObject and 'path' in keyObject['relative']:
                    for path in keyObject['relative']['path']:
                        keysToCheck[key] = keysToCheck[key] + 1
                        reports['ItemTested'] += 1
                        try:
                            actualPath = urllib.parse.urljoin(filePath, path)
                            found = next((i for i, item in enumerate(obj_list)
                                          if item['dataset']['path'] == actualPath), None)
                            if found is None:
                                reports[key].append({
                                    'RelativePath': path,
                                    'Reason': 'Cannot find the path',
                                    'ReasonDetails': doc_link + '#reason-cannot-find-the-path'
                                })
                                reports['TotalErrors'] += 1
                            elif key == 'isSourceOf':
                                # Check for a thumbnail among the sources.
                                thumbnailFound = checkForThumbnail(obj_list[found], obj_list)
                        except Exception:
                            reports[key].append({
                                'RelativePath': path,
                                'Reason': 'Encounter a problem while looking for path',
                                'ReasonDetails': doc_link + '#reason-encounter-a-problem-while-looking-for-path'
                            })
                            reports['TotalErrors'] += 1

    if mapped_mimetype in MIMETYPE_WITH_THUMBNAILS:
        # Report the more specific error only once: either the isSourceOf
        # entry is missing entirely, or it is present but has no thumbnail.
        if keysToCheck['isSourceOf'] == 0:
            reports['ThumbnailError'] = 'Missing isSourceOf entry'
            reports['ThumbnailErrorDetails'] = doc_link + '#thumbnailerror-missing-issourceof-entry'
            reports['TotalErrors'] += 1
        elif thumbnailFound is False:
            reports['ThumbnailError'] = 'Thumbnail not found in isSourceOf'
            reports['ThumbnailErrorDetails'] = doc_link + '#thumbnailerror-thumbnail-not-found-in-issourceof'
            reports['TotalErrors'] += 1

    return reports


# Test an object to check for any possible errors.
def testObj(obj_list, obj, mime_type, mapped_mime_type, prefix, bucket):
    dataciteReport = None
    fileResponse = None

    if 'dataset' in obj and 'path' in obj['dataset']:
        localPath = obj['dataset']['path']
        path = f"{prefix}/{localPath}"
        fileResponse = getFileResponse(localPath, path, mime_type, bucket)
        dataciteReport = getDataciteReport(obj_list, obj, mapped_mime_type, localPath)
        if dataciteReport['TotalErrors'] > 0:
            if fileResponse is None:
                fileResponse = {
                    'Mimetype': mime_type,
                    'Path': localPath,
                }
            fileResponse['DataciteReport'] = dataciteReport
    else:
        fileResponse = {
            'Mimetype': mime_type,
            'Path': 'Not found',
            'Reason': 'Cannot find path',
            'ReasonDetails': doc_link + '#reason-cannot-find-the-path'
        }

    return fileResponse
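
# The two dataset-level checks below walk a SciCrunch `_source` document.
# Based on the fields requested in getDatasets(), a search hit is expected to
# look roughly like this (a hand-written sketch, not real query output):
#
# {
#     "_id": "...",
#     "_source": {
#         "item": {"curie": "DOI:...", "name": "...", "types": [{"name": "scaffold"}]},
#         "pennsieve": {
#             "identifier": "120",
#             "version": {"identifier": "3"},
#             "uri": "s3://pennsieve-prod-discover-publish-use1/120/"
#         },
#         "objects": [ ...entries shaped as sketched above testObj()... ]
#     }
# }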

def test_obj_list(id, version, obj_list, scaffoldTag, bucket):
    objectErrors = []
    prefix = f"{id}/files"
    foundScaffold = False
    foundContextInfo = False
    datasetErrors = []

    for obj in obj_list:
        mime_type = getObjectMimeType(obj)
        mapped_mime_type = map_mime_type(mime_type)
        if mapped_mime_type != NOT_SPECIFIED:
            if mapped_mime_type == SCAFFOLD_FILE:
                foundScaffold = True
            if mapped_mime_type == CONTEXT_FILE:
                foundContextInfo = True
            error = testObj(obj_list, obj, mime_type, mapped_mime_type, prefix, bucket)
            if error:
                objectErrors.append(error)

    if foundScaffold:
        if not foundContextInfo:
            datasetErrors.append({
                'Reason': 'Contextual Information cannot be found while scaffold is present',
                'Details': doc_link + '#contextual-information-cannot-be-found-while-scaffold-is-present'
            })
        if not scaffoldTag:
            datasetErrors.append({
                'Reason': 'Scaffold found in objects list but the dataset is not tagged with scaffold (types.item.name)',
                'Details': doc_link + '#scaffold-found-in-objects-list-but-the-dataset-is-not-tagged-with-scaffold-typesitemname'
            })
    elif scaffoldTag:
        datasetErrors.append({
            'Reason': 'Dataset is tagged with scaffold (types.item.name) but no scaffold can be found in the list of objects.',
            'Details': doc_link + '#dataset-is-tagged-with-scaffold-typesitemname-but-no-scaffold-can-be-found-in-the-list-of-objects'
        })

    numberOfErrors = len(objectErrors)
    fileReports = {
        'Total': numberOfErrors,
        'Objects': objectErrors
    }
    return {"FileReports": fileReports, "DatasetErrors": datasetErrors}


# Test a single dataset.
def test_datasets_information(dataset):
    scaffoldTag = False
    report = {
        'Id': 'none',
        'DOI': 'none',
        '_id': dataset['_id'],
        'Errors': [],
        'ObjectErrors': {'Total': 0, 'Objects': []}
    }
    if '_source' in dataset:
        source = dataset['_source']
        if 'item' in source:
            report['Name'] = source['item'].get('name', 'none')
            report['DOI'] = source['item'].get('curie', 'none')
            if 'types' in source['item']:
                for item_type in source['item']['types']:
                    if 'name' in item_type and item_type['name'] == 'scaffold':
                        scaffoldTag = True
        if 'pennsieve' in source and 'version' in source['pennsieve'] \
                and 'identifier' in source['pennsieve']:
            id = source['pennsieve']['identifier']
            version = source['pennsieve']['version']['identifier']
            report['Id'] = id
            report['Version'] = version
            bucket = S3_BUCKET_NAME
            if 'uri' in source['pennsieve']:
                bucket = extract_bucket_name(source['pennsieve']['uri'])
            if version:
                if 'objects' in source:
                    obj_list = source['objects']
                    obj_reports = test_obj_list(id, version, obj_list, scaffoldTag, bucket)
                    report['ObjectErrors'] = obj_reports['FileReports']
                    report['Errors'].extend(obj_reports["DatasetErrors"])
            else:
                report['Errors'].append('Missing version')
    return report
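
# The test runner below pages through the SciCrunch results and accumulates the
# per-dataset reports into reports/error_reports.json. Sketched from the code
# below, the written file should have this overall shape (values illustrative):
#
# {
#     "Tested": 123,
#     "Failed": 2,
#     "FailedIds": ["120", "345"],
#     "Datasets": [
#         {"Id": "120", "DOI": "...", "Errors": [...], "ObjectErrors": {...}},
#         ...
#     ]
# }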

class SciCrunchDatasetFilesTest(unittest.TestCase):

    def test_files_information(self):
        start = 0
        size = 20
        keepGoing = True
        totalSize = 0
        reportOutput = 'reports/error_reports.json'
        reports = {'Tested': 0, 'Failed': 0, 'FailedIds': [], 'Datasets': []}

        while keepGoing:
            scicrunch_response = getDatasets(start, size)
            self.assertEqual(200, scicrunch_response.status_code)

            data = scicrunch_response.json()

            # No more results, stop.
            if size > len(data['hits']['hits']):
                keepGoing = False

            start = start + size

            for dataset in data['hits']['hits']:
                totalSize += 1
                report = test_datasets_information(dataset)
                print(f"Reports generated for {report['Id']}")
                if len(report['Errors']) > 0 or report['ObjectErrors']['Total'] > 0:
                    reports['FailedIds'].append(report['Id'])
                    reports['Datasets'].append(report)

        # Generate the report.
        reports['Tested'] = totalSize
        print(f"Number of datasets tested: {reports['Tested']}")
        reports['Failed'] = len(reports['FailedIds'])
        print(f"Number of datasets with errors: {reports['Failed']}")
        if reports['Failed'] > 0:
            print(f"Failed Datasets: {reports['FailedIds']}")

        os.makedirs(os.path.dirname(reportOutput), exist_ok=True)
        with open(reportOutput, 'w') as outfile:
            json.dump(reports, outfile, indent=4)

        print(f"Full report has been generated at {reportOutput}")

        self.assertEqual(0, len(reports['FailedIds']))


if __name__ == '__main__':
    unittest.main()
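
# For reference, this module expects tests/config.py to expose a Config object
# with the four attributes read above. A minimal sketch (the attribute names
# come from the usage in this file; sourcing them from environment variables
# is an assumption, not necessarily how the real config is written):
#
# import os
#
# class Config(object):
#     AWS_KEY = os.environ.get('AWS_KEY', '')
#     AWS_SECRET = os.environ.get('AWS_SECRET', '')
#     SCICRUNCH_API_HOST = os.environ.get('SCICRUNCH_API_HOST', '')
#     SCICRUNCH_API_KEY = os.environ.get('SCICRUNCH_API_KEY', '')
#
# With that in place, the suite can be run from the repository root with, e.g.:
#     python -m unittest <path.to.this.module>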