From f3ae5ec79238f3150c11ffa19e80656bdb16cb7b Mon Sep 17 00:00:00 2001 From: aeschweik Date: Mon, 3 Feb 2020 13:12:13 -0800 Subject: [PATCH 1/8] More automation for metadata CSV Automatic concatenation of columns, plus adding/removing strings from other columns. Needs review by someone who actually knows Python as it could be more elegant (though this does seem to work). There are a lot of "if" statements because I didn't want to assign the same generic text to objects without those fields (e.g. uploads shouldn't say "urn:bampfa_accession_number:" if there's no accession # at the end of it). --- rs2ia.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/rs2ia.py b/rs2ia.py index c41da9d..d542f9c 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -215,6 +215,35 @@ def post_to_ia(self): elif self.mediaType == 'mp3': ia_mediatype = 'audio' + # normalizing columns from csv for use in metadata dict + # "if" statement: if the metadata field exists (as a string), then add it to the dictionary value + # initializing each field to "" so that metadata dict doesn't complain that it's missing; blank fields will be removed later + # concatenate 'description' fields + self.description = "" + if self.assetMetadata['Notes'] : + self.description = "Notes: " + self.assetMetadata['Notes'] + "; " + if self.assetMetadata['Alternative Title'] : + self.description += "Alternative Title: " + self.assetMetadata['Alternative Title'] + "; " + if self.assetMetadata['Credits'] : + self.description += "Credits: " + self.assetMetadata['Credits'] + # concatenate 'source' fields + self.source = "" + if self.assetMetadata['Medium of original'] : + self.source = "Medium of original: " + self.assetMetadata['Medium of original'] + "; " + if self.assetMetadata['Dimensions of original'] : + self.source += "Dimensions of original: " + self.assetMetadata['Dimensions of original'] + "; " + if self.assetMetadata['Original video standard'] : + self.source += 
"Original video standard: " + self.assetMetadata['Original video standard'] + "; " + if self.assetMetadata['Generation'] : + self.source += "Generation: " + self.assetMetadata['Generation'] + # remove "fps" and leading/trailing spaces from 'frame rate' column + self.frames_per_second = "" + self.frames_per_second = self.assetMetadata['Frame rate'].strip(' fps').strip() + # add 'urn:bampfa_accession_number:' to accession # (this conforms to IA style guide) + self.externalidentifier = "" + if self.assetMetadata['PFA full accession number'] : + self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] + md = { # LET'S THINK ABOUT HOW TO MAKE THIS SET OF MD MORE AGNOSTIC/GENERALIZABLE 'collection': self.collection, @@ -227,17 +256,13 @@ def post_to_ia(self): 'identifier': identifier, 'title': self.assetMetadata['Title'], 'date': self.assetMetadata['Release Date'], - # Original columns 'Notes,' 'Alternative Title,' 'Credits' should be concatenated manually by operator into single column 'Notes' - 'description': self.assetMetadata['Notes'], - # Original columns 'Medium of original,' 'Dimensions of original,' 'Original video standard,' 'Generation' columns should be concatenated manually by operator into single column 'Medium of original' - 'source': self.assetMetadata['Medium of original'], - # 'frame rate' column should be normalized into numbers manually by operator - 'frames_per_second': self.assetMetadata['Frame rate'], + 'description': self.description, + 'source': self.source, + 'frames_per_second': self.frames_per_second, # 'video size' column should be split into 'Video height' and 'Video width' numbers manually by operator # 'source_pixel_width': self.assetMetadata['Video height'], # 'source_pixel_height': self.assetMetadata['Video width'], - # 'PFA full accession number' column should be normalized to 'urn:bampfa_accession_number:XXXX' manually by operator - 'external-identifier': self.assetMetadata['PFA full 
accession number'], + 'external-identifier': self.externalidentifier, 'condition': self.assetMetadata['Original Material Condition'], 'sound': self.assetMetadata['PFA item sound characteristics'], 'color': self.assetMetadata['Color characteristics'] From c706089a8333495170481dd23c6f6aaed2b67144 Mon Sep 17 00:00:00 2001 From: aeschweik Date: Mon, 3 Feb 2020 16:50:32 -0800 Subject: [PATCH 2/8] CSV: Split video size into video height/width --- rs2ia.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rs2ia.py b/rs2ia.py index d542f9c..c9fea20 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -243,6 +243,12 @@ def post_to_ia(self): self.externalidentifier = "" if self.assetMetadata['PFA full accession number'] : self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] + # split 'video size' column into 'Video height' and 'Video width' numbers + self.videoSize = "" + if self.assetMetadata['Video size'] : + self.videoSize = self.assetMetadata['Video size'].split("x") + self.videoWidth = self.videoSize[0] + self.videoHeight = self.videoSize[1] md = { # LET'S THINK ABOUT HOW TO MAKE THIS SET OF MD MORE AGNOSTIC/GENERALIZABLE @@ -259,9 +265,8 @@ def post_to_ia(self): 'description': self.description, 'source': self.source, 'frames_per_second': self.frames_per_second, - # 'video size' column should be split into 'Video height' and 'Video width' numbers manually by operator - # 'source_pixel_width': self.assetMetadata['Video height'], - # 'source_pixel_height': self.assetMetadata['Video width'], + 'source_pixel_width': self.videoWidth, + 'source_pixel_height': self.videoHeight, 'external-identifier': self.externalidentifier, 'condition': self.assetMetadata['Original Material Condition'], 'sound': self.assetMetadata['PFA item sound characteristics'], From 629c185e56dfe3d295eb00510ce001aa7003cccc Mon Sep 17 00:00:00 2001 From: aeschweik Date: Mon, 3 Feb 2020 16:50:52 -0800 Subject: [PATCH 3/8] CSV/metadata: 
commenting/documentation updates --- readme.md | 2 +- rs2ia.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/readme.md b/readme.md index 41d7c31..0626b95 100644 --- a/readme.md +++ b/readme.md @@ -46,7 +46,7 @@ Creating and downloading the spreadsheet: Normalizing the spreadsheet: -In the downloaded CSV, combine or split up the following fields: +** The script should normalize all of the following fields itself! ** BUT, if you encounter any trouble, you can also manually normalize the CSV by combining or splitting up the following fields: * Combine the columns 'Notes,' 'Alternative Title,' 'Credits' into a single column 'Notes' * Combine the columns 'Medium of original,' 'Dimensions of original,' 'Original video standard,' 'Generation' columns into a single column 'Medium of original' * Normalize the 'frame rate' column into numbers only (e.g., remove the word 'fps') diff --git a/rs2ia.py b/rs2ia.py index c9fea20..ad3e2b5 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -215,12 +215,15 @@ def post_to_ia(self): elif self.mediaType == 'mp3': ia_mediatype = 'audio' - # normalizing columns from csv for use in metadata dict - # "if" statement: if the metadata field exists (as a string), then add it to the dictionary value - # initializing each field to "" so that metadata dict doesn't complain that it's missing; blank fields will be removed later + ''' + Normalizing columns from csv for use in metadata dict. + This is required because the RS and IA metadata fields do not directly + map to each other. Each field is initialized to "" so that the metadata + dict doesn't complain that it's missing; blank fields will be removed later. 
+ ''' # concatenate 'description' fields self.description = "" - if self.assetMetadata['Notes'] : + if self.assetMetadata['Notes'] : # if the metadata field exists (as a string), then add it to the dictionary value self.description = "Notes: " + self.assetMetadata['Notes'] + "; " if self.assetMetadata['Alternative Title'] : self.description += "Alternative Title: " + self.assetMetadata['Alternative Title'] + "; " From 2bc5452109c169bbeddfff635f00e5b01efb4985 Mon Sep 17 00:00:00 2001 From: aeschweik Date: Tue, 4 Feb 2020 10:46:12 -0800 Subject: [PATCH 4/8] Remove variables defined by IA software 'frames_per_second', 'source_pixel_width', and 'source_pixel_height' are defined by IA software (not the person uploading), as per IA metadata schema https://archive.org/services/docs/api/metadata-schema/index.html --- readme.md | 2 -- rs2ia.py | 12 ------------ 2 files changed, 14 deletions(-) diff --git a/readme.md b/readme.md index 0626b95..00c1b16 100644 --- a/readme.md +++ b/readme.md @@ -49,8 +49,6 @@ Normalizing the spreadsheet: ** The script should normalize all of the following fields itself! 
** BUT, if you encounter any trouble, you can also manually normalize the CSV by combining or splitting up the following fields: * Combine the columns 'Notes,' 'Alternative Title,' 'Credits' into a single column 'Notes' * Combine the columns 'Medium of original,' 'Dimensions of original,' 'Original video standard,' 'Generation' columns into a single column 'Medium of original' - * Normalize the 'frame rate' column into numbers only (e.g., remove the word 'fps') - * Split the 'video size' column into 'Video height' and 'Video width'; only use numbers (e.g., turn '640x480' into '640' and '480') * Add 'urn:bampfa_accession_number:' (no quotes) before each the accession number in the 'PFA full accession number' column diff --git a/rs2ia.py b/rs2ia.py index ad3e2b5..dd9bb9a 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -239,19 +239,10 @@ def post_to_ia(self): self.source += "Original video standard: " + self.assetMetadata['Original video standard'] + "; " if self.assetMetadata['Generation'] : self.source += "Generation: " + self.assetMetadata['Generation'] - # remove "fps" and leading/trailing spaces from 'frame rate' column - self.frames_per_second = "" - self.frames_per_second = self.assetMetadata['Frame rate'].strip(' fps').strip() # add 'urn:bampfa_accession_number:' to accession # (this conforms to IA style guide) self.externalidentifier = "" if self.assetMetadata['PFA full accession number'] : self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] - # split 'video size' column into 'Video height' and 'Video width' numbers - self.videoSize = "" - if self.assetMetadata['Video size'] : - self.videoSize = self.assetMetadata['Video size'].split("x") - self.videoWidth = self.videoSize[0] - self.videoHeight = self.videoSize[1] md = { # LET'S THINK ABOUT HOW TO MAKE THIS SET OF MD MORE AGNOSTIC/GENERALIZABLE @@ -267,9 +258,6 @@ def post_to_ia(self): 'date': self.assetMetadata['Release Date'], 'description': self.description, 
'source': self.source, - 'frames_per_second': self.frames_per_second, - 'source_pixel_width': self.videoWidth, - 'source_pixel_height': self.videoHeight, 'external-identifier': self.externalidentifier, 'condition': self.assetMetadata['Original Material Condition'], 'sound': self.assetMetadata['PFA item sound characteristics'], From 9356bf21ac7186521e4648190f03fa3d41c85296 Mon Sep 17 00:00:00 2001 From: aeschweik Date: Tue, 4 Feb 2020 10:48:38 -0800 Subject: [PATCH 5/8] Initialize metadata variables as blank "" Moved to one location in script. (variables not pulled directly from metadata dict do not exist unless initialized as blank strings, so the script will throw an error when trying to use them in the dict (instead of just ignoring)) --- rs2ia.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rs2ia.py b/rs2ia.py index dd9bb9a..774483d 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -119,6 +119,11 @@ def __init__( self._user = _user self.rsAPI = ResourceSpaceAPI(_user) + # initializing metadata attributes required/desired by IA that don't appear in RS + self.description = "" + self.source = "" + self.externalidentifier = "" + def get_local_asset_path(self): # see https://www.resourcespace.com/knowledge-base/api/get_resource_path # construct parameters of API call as a string @@ -222,7 +227,6 @@ def post_to_ia(self): dict doesn't complain that it's missing; blank fields will be removed later. 
''' # concatenate 'description' fields - self.description = "" if self.assetMetadata['Notes'] : # if the metadata field exists (as a string), then add it to the dictionary value self.description = "Notes: " + self.assetMetadata['Notes'] + "; " if self.assetMetadata['Alternative Title'] : @@ -230,7 +234,6 @@ def post_to_ia(self): if self.assetMetadata['Credits'] : self.description += "Credits: " + self.assetMetadata['Credits'] # concatenate 'source' fields - self.source = "" if self.assetMetadata['Medium of original'] : self.source = "Medium of original: " + self.assetMetadata['Medium of original'] + "; " if self.assetMetadata['Dimensions of original'] : @@ -240,7 +243,6 @@ def post_to_ia(self): if self.assetMetadata['Generation'] : self.source += "Generation: " + self.assetMetadata['Generation'] # add 'urn:bampfa_accession_number:' to accession # (this conforms to IA style guide) - self.externalidentifier = "" if self.assetMetadata['PFA full accession number'] : self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] From c17c80b73413e48190d2cac0cb12929b93e07df7 Mon Sep 17 00:00:00 2001 From: aeschweik Date: Tue, 4 Feb 2020 15:18:36 -0800 Subject: [PATCH 6/8] add more metadata fields Based on metadata fields used by sample of 300 objects in PFA's ResourceSpace. 
--- rs2ia.py | 59 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/rs2ia.py b/rs2ia.py index 774483d..7588fb7 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -123,6 +123,7 @@ def __init__( self.description = "" self.source = "" self.externalidentifier = "" + self.date = "" def get_local_asset_path(self): # see https://www.resourcespace.com/knowledge-base/api/get_resource_path @@ -229,6 +230,8 @@ def post_to_ia(self): # concatenate 'description' fields if self.assetMetadata['Notes'] : # if the metadata field exists (as a string), then add it to the dictionary value self.description = "Notes: " + self.assetMetadata['Notes'] + "; " + if self.assetMetadata['Description'] : + self.description = "Description: " + self.assetMetadata['Description'] + "; " if self.assetMetadata['Alternative Title'] : self.description += "Alternative Title: " + self.assetMetadata['Alternative Title'] + "; " if self.assetMetadata['Credits'] : @@ -238,33 +241,65 @@ def post_to_ia(self): self.source = "Medium of original: " + self.assetMetadata['Medium of original'] + "; " if self.assetMetadata['Dimensions of original'] : self.source += "Dimensions of original: " + self.assetMetadata['Dimensions of original'] + "; " - if self.assetMetadata['Original video standard'] : - self.source += "Original video standard: " + self.assetMetadata['Original video standard'] + "; " - if self.assetMetadata['Generation'] : - self.source += "Generation: " + self.assetMetadata['Generation'] # add 'urn:bampfa_accession_number:' to accession # (this conforms to IA style guide) if self.assetMetadata['PFA full accession number'] : self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] + if self.assetMetadata['Date of recording'] : + self.date = self.assetMetadata['Date of recording'] + elif self.assetMetadata['Release Date'] : + self.date = self.assetMetadata['Release Date'] + # audio- and video-specific 
fields + if ia_mediatype == 'movies' : + if self.assetMetadata['Original video format'] : + self.source += "Original video format: " + self.assetMetadata['Original video format'] + "; " + if self.assetMetadata['Original video standard'] : + self.source += "Original video standard: " + self.assetMetadata['Original video standard'] + "; " + if self.assetMetadata['Generation'] : + self.source += "Generation: " + self.assetMetadata['Generation'] + elif ia_mediatype == 'audio' : + if self.assetMetadata['PFA film series'] : + self.description = "Pacific Film Archive film series: " + self.assetMetadata['PFA film series'] + "; " + if self.assetMetadata['Event title'] : + self.description = "Event title: " + self.assetMetadata['Event title'] + "; " + if self.assetMetadata['Speaker/Interviewee'] : + self.description = "Speaker/Interviewee: " + self.assetMetadata['Speaker/Interviewee'] + "; " + if self.assetMetadata['Subject(s): Film title(s)'] : + self.description = "Subject(s): Film title(s): " + self.assetMetadata['Subject(s): Film title(s)'] + "; " + if self.assetMetadata['Subject(s): Topics(s)'] : + self.description = "Subject(s): Topics(s): " + self.assetMetadata['Subject(s): Topics(s)'] + "; " - md = { - # LET'S THINK ABOUT HOW TO MAKE THIS SET OF MD MORE AGNOSTIC/GENERALIZABLE + # general MD dict + general_md = { 'collection': self.collection, 'collection': self.collection2, # this overrides the previous line - 'rights': 'This is a rights statement', + 'rights': self.assetMetadata['Copyright statement'], 'mediatype': ia_mediatype, #'licenseurl': self.license, - 'creator': self.assetMetadata['Directors / Filmmakers'], 'contributor': self.assetMetadata['Resource type'], 'identifier': identifier, + 'external-identifier': self.externalidentifier, 'title': self.assetMetadata['Title'], - 'date': self.assetMetadata['Release Date'], + 'date': self.date, 'description': self.description, 'source': self.source, - 'external-identifier': self.externalidentifier, - 'condition': 
self.assetMetadata['Original Material Condition'], + 'language': self.assetMetadata['Language'] + } + movies_md = { 'sound': self.assetMetadata['PFA item sound characteristics'], - 'color': self.assetMetadata['Color characteristics'] + 'color': self.assetMetadata['Color characteristics'], + 'creator': self.assetMetadata['Directors / Filmmakers'], } + # audio_md = { + # # need appropriate creator field for audio; will all audio be PFA lecture series? + # } + + # concatenate dictionaries + if ia_mediatype == 'movies' : + md = dict(general_md) + md.update(movies_md) + elif ia_mediatype == 'audio' : + md = dict(general_md) + # get rid of empty values in the md dictionary md = {k: v for k, v in md.items() if v not in (None,'')} # archive.org Python Library, 'uploading': https://archive.org/services/docs/api/internetarchive/quickstart.html#uploading From 80c038bdf171f9c1c1da1f1453c16b2bad60b383 Mon Sep 17 00:00:00 2001 From: aeschweik Date: Tue, 4 Feb 2020 15:25:11 -0800 Subject: [PATCH 7/8] Strip "\n" from values in metadata dict --- rs2ia.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rs2ia.py b/rs2ia.py index 7588fb7..e583c3a 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -302,6 +302,9 @@ def post_to_ia(self): # get rid of empty values in the md dictionary md = {k: v for k, v in md.items() if v not in (None,'')} + # remove line breaks that display as literal "\n" + md = {v.replace('\n', ' '): k + for k, v in md.items()} # archive.org Python Library, 'uploading': https://archive.org/services/docs/api/internetarchive/quickstart.html#uploading print("ACCESS COPY FILENAME:") print(identifier) From d41e04c81ed5b7316b1fa5d5758ed74ab7bfbd74 Mon Sep 17 00:00:00 2001 From: aeschweik Date: Tue, 4 Feb 2020 15:30:00 -0800 Subject: [PATCH 8/8] change "; " to ". " in concatenated metadata values an aura of finality --- rs2ia.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rs2ia.py b/rs2ia.py index e583c3a..d75f786 100644 --- a/rs2ia.py +++ b/rs2ia.py @@ -229,18 +229,18 @@ def post_to_ia(self): ''' # concatenate 'description' fields if self.assetMetadata['Notes'] : # if the metadata field exists (as a string), then add it to the dictionary value - self.description = "Notes: " + self.assetMetadata['Notes'] + "; " + self.description = "Notes: " + self.assetMetadata['Notes'] + ". " if self.assetMetadata['Description'] : - self.description = "Description: " + self.assetMetadata['Description'] + "; " + self.description = "Description: " + self.assetMetadata['Description'] + ". " if self.assetMetadata['Alternative Title'] : - self.description += "Alternative Title: " + self.assetMetadata['Alternative Title'] + "; " + self.description += "Alternative Title: " + self.assetMetadata['Alternative Title'] + ". " if self.assetMetadata['Credits'] : self.description += "Credits: " + self.assetMetadata['Credits'] # concatenate 'source' fields if self.assetMetadata['Medium of original'] : - self.source = "Medium of original: " + self.assetMetadata['Medium of original'] + "; " + self.source = "Medium of original: " + self.assetMetadata['Medium of original'] + ". " if self.assetMetadata['Dimensions of original'] : - self.source += "Dimensions of original: " + self.assetMetadata['Dimensions of original'] + "; " + self.source += "Dimensions of original: " + self.assetMetadata['Dimensions of original'] + ". 
" # add 'urn:bampfa_accession_number:' to accession # (this conforms to IA style guide) if self.assetMetadata['PFA full accession number'] : self.externalidentifier = "urn:bampfa_accession_number:" + self.assetMetadata['PFA full accession number'] @@ -251,22 +251,22 @@ def post_to_ia(self): # audio- and video-specific fields if ia_mediatype == 'movies' : if self.assetMetadata['Original video format'] : - self.source += "Original video format: " + self.assetMetadata['Original video format'] + "; " + self.source += "Original video format: " + self.assetMetadata['Original video format'] + ". " if self.assetMetadata['Original video standard'] : - self.source += "Original video standard: " + self.assetMetadata['Original video standard'] + "; " + self.source += "Original video standard: " + self.assetMetadata['Original video standard'] + ". " if self.assetMetadata['Generation'] : self.source += "Generation: " + self.assetMetadata['Generation'] elif ia_mediatype == 'audio' : if self.assetMetadata['PFA film series'] : - self.description = "Pacific Film Archive film series: " + self.assetMetadata['PFA film series'] + "; " + self.description = "Pacific Film Archive film series: " + self.assetMetadata['PFA film series'] + ". " if self.assetMetadata['Event title'] : - self.description = "Event title: " + self.assetMetadata['Event title'] + "; " + self.description = "Event title: " + self.assetMetadata['Event title'] + ". " if self.assetMetadata['Speaker/Interviewee'] : - self.description = "Speaker/Interviewee: " + self.assetMetadata['Speaker/Interviewee'] + "; " + self.description = "Speaker/Interviewee: " + self.assetMetadata['Speaker/Interviewee'] + ". " if self.assetMetadata['Subject(s): Film title(s)'] : - self.description = "Subject(s): Film title(s): " + self.assetMetadata['Subject(s): Film title(s)'] + "; " + self.description = "Subject(s): Film title(s): " + self.assetMetadata['Subject(s): Film title(s)'] + ". 
" if self.assetMetadata['Subject(s): Topics(s)'] : - self.description = "Subject(s): Topics(s): " + self.assetMetadata['Subject(s): Topics(s)'] + "; " + self.description = "Subject(s): Topics(s): " + self.assetMetadata['Subject(s): Topics(s)'] + ". " # general MD dict general_md = {