diff --git a/readme.md b/readme.md index 41d7c31..00c1b16 100644 --- a/readme.md +++ b/readme.md @@ -46,11 +46,9 @@ Creating and downloading the spreadsheet: Normalizing the spreadsheet: -In the downloaded CSV, combine or split up the following fields: +**The script should normalize all of the following fields itself!** BUT, if you encounter any trouble, you can also manually normalize the CSV by combining or splitting up the following fields: * Combine the columns 'Notes,' 'Alternative Title,' 'Credits' into a single column 'Notes' * Combine the columns 'Medium of original,' 'Dimensions of original,' 'Original video standard,' 'Generation' columns into a single column 'Medium of original' - * Normalize the 'frame rate' column into numbers only (e.g., remove the word 'fps') - * Split the 'video size' column into 'Video height' and 'Video width'; only use numbers (e.g., turn '640x480' into '640' and '480') * Add 'urn:bampfa_accession_number:' (no quotes) before each the accession number in the 'PFA full accession number' column diff --git a/rs2ia.py b/rs2ia.py index c41da9d..d75f786 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -119,6 +119,12 @@ def __init__( self._user = _user self.rsAPI = ResourceSpaceAPI(_user) + # initializing metadata attributes required/desired by IA that don't appear in RS + self.description = "" + self.source = "" + self.externalidentifier = "" + self.date = "" + def get_local_asset_path(self): # see https://www.resourcespace.com/knowledge-base/api/get_resource_path # construct parameters of API call as a string @@ -215,35 +221,90 @@ def post_to_ia(self): elif self.mediaType == 'mp3': ia_mediatype = 'audio' - md = { - # LET'S THINK ABOUT HOW TO MAKE THIS SET OF MD MORE AGNOSTIC/GENERALIZABLE + ''' + Normalizing columns from csv for use in metadata dict. + This is required because the RS and IA metadata fields do not directly + map to each other. 
Each field is initialized to "" so that the metadata + dict doesn't complain that it's missing; blank fields will be removed later. + ''' + # concatenate 'description' fields + if self.assetMetadata['Notes'] : # if the metadata field exists (as a string), then add it to the dictionary value + self.description = "Notes: " + self.assetMetadata['Notes'] + ". " + if self.assetMetadata['Description'] : + self.description = "Description: " + self.assetMetadata['Description'] + ". " + if self.assetMetadata['Alternative Title'] : + self.description += "Alternative Title: " + self.assetMetadata['Alternative Title'] + ". " + if self.assetMetadata['Credits'] : + self.description += "Credits: " + self.assetMetadata['Credits'] + # concatenate 'source' fields + if self.assetMetadata['Medium of original'] : + self.source = "Medium of original: " + self.assetMetadata['Medium of original'] + ". " + if self.assetMetadata['Dimensions of original'] : + self.source += "Dimensions of original: " + self.assetMetadata['Dimensions of original'] + ". " + # add 'urn:bampfa_accession_number:' to accession # (this conforms to IA style guide) + if self.assetMetadata['PFA full accession number'] : + self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] + if self.assetMetadata['Date of recording'] : + self.date = self.assetMetadata['Date of recording'] + elif self.assetMetadata['Release Date'] : + self.date = self.assetMetadata['Release Date'] + # audio- and video-specific fields + if ia_mediatype == 'movies' : + if self.assetMetadata['Original video format'] : + self.source += "Original video format: " + self.assetMetadata['Original video format'] + ". " + if self.assetMetadata['Original video standard'] : + self.source += "Original video standard: " + self.assetMetadata['Original video standard'] + ". 
" + if self.assetMetadata['Generation'] : + self.source += "Generation: " + self.assetMetadata['Generation'] + elif ia_mediatype == 'audio' : + if self.assetMetadata['PFA film series'] : + self.description = "Pacific Film Archive film series: " + self.assetMetadata['PFA film series'] + ". " + if self.assetMetadata['Event title'] : + self.description = "Event title: " + self.assetMetadata['Event title'] + ". " + if self.assetMetadata['Speaker/Interviewee'] : + self.description = "Speaker/Interviewee: " + self.assetMetadata['Speaker/Interviewee'] + ". " + if self.assetMetadata['Subject(s): Film title(s)'] : + self.description = "Subject(s): Film title(s): " + self.assetMetadata['Subject(s): Film title(s)'] + ". " + if self.assetMetadata['Subject(s): Topics(s)'] : + self.description = "Subject(s): Topics(s): " + self.assetMetadata['Subject(s): Topics(s)'] + ". " + + # general MD dict + general_md = { 'collection': self.collection, 'collection': self.collection2, # this overrides the previous line - 'rights': 'This is a rights statement', + 'rights': self.assetMetadata['Copyright statement'], 'mediatype': ia_mediatype, #'licenseurl': self.license, - 'creator': self.assetMetadata['Directors / Filmmakers'], 'contributor': self.assetMetadata['Resource type'], 'identifier': identifier, + 'external-identifier': self.externalidentifier, 'title': self.assetMetadata['Title'], - 'date': self.assetMetadata['Release Date'], - # Original columns 'Notes,' 'Alternative Title,' 'Credits' should be concatenated manually by operator into single column 'Notes' - 'description': self.assetMetadata['Notes'], - # Original columns 'Medium of original,' 'Dimensions of original,' 'Original video standard,' 'Generation' columns should be concatenated manually by operator into single column 'Medium of original' - 'source': self.assetMetadata['Medium of original'], - # 'frame rate' column should be normalized into numbers manually by operator - 'frames_per_second': self.assetMetadata['Frame 
rate'], - # 'video size' column should be split into 'Video height' and 'Video width' numbers manually by operator - # 'source_pixel_width': self.assetMetadata['Video height'], - # 'source_pixel_height': self.assetMetadata['Video width'], - # 'PFA full accession number' column should be normalized to 'urn:bampfa_accession_number:XXXX' manually by operator - 'external-identifier': self.assetMetadata['PFA full accession number'], - 'condition': self.assetMetadata['Original Material Condition'], + 'date': self.date, + 'description': self.description, + 'source': self.source, + 'language': self.assetMetadata['Language'] + } + movies_md = { 'sound': self.assetMetadata['PFA item sound characteristics'], - 'color': self.assetMetadata['Color characteristics'] + 'color': self.assetMetadata['Color characteristics'], + 'creator': self.assetMetadata['Directors / Filmmakers'], } + # audio_md = { + # # need appropriate creator field for audio; will all audio be PFA lecture series? + # } + + # concatenate dictionaries + if ia_mediatype == 'movies' : + md = dict(general_md) + md.update(movies_md) + elif ia_mediatype == 'audio' : + md = dict(general_md) + # get rid of empty values in the md dictionary md = {k: v for k, v in md.items() if v not in (None,'')} + # remove line breaks that display as literal "
" + md = {v.replace('
', ' '): k + for k, v in md.items()} # archive.org Python Library, 'uploading': https://archive.org/services/docs/api/internetarchive/quickstart.html#uploading print("ACCESS COPY FILENAME:") print(identifier)