mangecoeur · dhufe · Dec 14, 2021 · Dec 14, 2021 · Dec 15, 2021 · mangecoeur
diff --git a/Default.sublime-commands b/Default.sublime-commands
@@ -14,5 +14,9 @@
 	{
 		"caption": "Citer: Combine Citations",
 		"command": "citer_combine_citations"
-	}
+	},
+    {
+        "caption": "Citer: Extract used citations",
+        "command": "citer_extract_citations"
+    }
 ]
diff --git a/citer.py b/citer.py
@@ -23,7 +23,7 @@
 
 from bibtexparser.bparser import BibTexParser
 from bibtexparser.customization import convert_to_unicode
-
+from lib import md2bib
 
 # settings cache globals
 BIBFILE_PATH = None
@@ -111,7 +111,9 @@ def bibpath(self):
 
 def bibfile_modifed(bib_path):
     global _LST_MOD_TIME
-    bib_path = bib_path.strip()
+
+    bib_path = bib_path.replace('\'','').strip()
+
     last_modified_time = os.path.getmtime(bib_path)
     cached_modifed_time = _LST_MOD_TIME.get(bib_path)
     if cached_modifed_time is None or last_modified_time != cached_modifed_time:
@@ -126,8 +128,9 @@ def load_bibfile(bib_path):
         sublime.status_message("WARNING: No BibTex file configured for Citer")
         return {}
 
-    bib_path = bib_path.strip()
-    with open(bib_path, 'r', encoding="utf-8") as bibfile:
+    bib_path = bib_path.replace('\'','').strip()
+
+    with open( bib_path, 'r', encoding="utf-8") as bibfile:
         bp = BibTexParser(bibfile.read(),
                           customization=convert_to_unicode,
                           ignore_nonstandard_types=False)
@@ -156,14 +159,16 @@ def get_settings(setting, default):
             if setting == 'bibtex_file':
                 window = sublime.active_window()
                 ref_dir = os.path.dirname(window.project_file_name())
-                result = ref_dir + '/' + project_data['bibtex_file']
+                filename = project_data['bibtex_file']
+                result = ref_dir + '/' + filename.replace('\'', '')
                 return result
             else:
                 return project_data[setting]
         else:
             return settings.get(setting, default)
 
     settings = sublime.load_settings('Citer.sublime-settings')
+
     BIBFILE_PATH = get_settings('bibtex_file_path', None)
     SEARCH_IN = get_settings('search_fields', ["author", "title", "year", "id"])
     CITATION_FORMAT = get_settings('citation_format', "@%s")
@@ -388,6 +393,8 @@ class CiterCompleteCitationEventListener(sublime_plugin.EventListener):
 
     def on_query_completions(self, view, prefix, loc):
 
+        refresh_settings()
+
         in_scope = any(view.match_selector(loc[0], scope) for scope in COMPLETIONS_SCOPES)
         ex_scope = any(view.match_selector(loc[0], scope) for scope in EXCLUDED_SCOPES)
 
@@ -411,3 +418,18 @@ def run(self, edit):
         lstpos = self.view.find_all(r'\]\[')
         for i, pos in reversed(list(enumerate(lstpos))):
             self.view.replace(edit, pos, r'; ')
+
+
+class CiterExtractCitationsCommand(sublime_plugin.TextCommand):
+    """Write the BibTeX entries cited in the current file to a .bib file.
+
+    The output path is the current file's path with its extension replaced
+    by ``.bib``.  Nothing is written when the view has no file on disk,
+    when no bibliography is configured, or when the output path would
+    overwrite the current file or the configured bibliography.
+    """
+
+    def run(self, edit):
+        refresh_settings()
+
+        # Act on the view the command was invoked on.
+        current_file = self.view.file_name()
+        if not current_file:
+            # file_name() is None for a buffer that was never saved.
+            sublime.status_message(
+                "WARNING: Save the file before extracting citations")
+            return
+        if BIBFILE_PATH is None:
+            sublime.status_message(
+                "WARNING: No BibTex file configured for Citer")
+            return
+
+        # Same clean-up that load_bibfile applies to the configured path.
+        bib_path = BIBFILE_PATH.replace('\'', '').strip()
+
+        # split off extension
+        basefile, _ = os.path.splitext(current_file)
+        bibsubset_file = basefile + '.bib'
+
+        # The subset is written after its inputs are read, so a clashing
+        # output path would silently destroy one of them.
+        target = os.path.abspath(bibsubset_file)
+        inputs = (os.path.abspath(current_file), os.path.abspath(bib_path))
+        if target in inputs:
+            sublime.status_message(
+                "WARNING: Extracting citations would overwrite {}".format(
+                    os.path.basename(bibsubset_file)))
+            return
+
+        md2bib.extract_bibliography(current_file, bib_path, bibsubset_file)
+        fname = os.path.basename(bibsubset_file)
+
+        sublime.status_message('Extracted citations to {}'.format(fname))
diff --git a/lib/md2bib.py b/lib/md2bib.py
@@ -0,0 +1,118 @@
+"""
+Functions to extract the subset of BibTeX entries whose keys are cited
+in a Markdown or LaTeX file.
+
+Modified from md2bib.py [1], which is (c) Copyright 2011-2012 by
+Joseph Reagle and licensed under the GPLv3.
+
+[1] https://github.com/reagle/pandoc-wrappers/
+
+"""
+
+from collections import OrderedDict
+import logging
+import re
+
+
+# Capturing group for a run of key characters: word characters plus the
+# punctuation . : ; , -
+# Raw string so that ``\-`` and ``\w`` reach ``re`` verbatim instead of
+# being flagged by Python as invalid string escapes.
+BIBKEY_PAT = r'([.:;,\-\w]+)'
+
+
+def parse_bibtex(text):
+    """Parse BibTeX source into an ordered mapping of entries.
+
+    The parser is simple/fast *and* inflexible: an entry header
+    (``@type{key,``) and each ``field = {value},`` pair are matched one
+    line at a time, and only brace-delimited values are recognised.
+
+    Args:
+        text: An iterable of lines (e.g. the result of ``readlines()``),
+            or a single string, which is split into lines first.
+
+    Returns:
+        An ``OrderedDict`` mapping each citation key to an ``OrderedDict``
+        of its fields, with the entry type stored under ``'entry_type'``.
+
+    """
+    if isinstance(text, str):
+        # Iterating a bare string would yield characters, not lines.
+        text = text.splitlines()
+
+    entries = OrderedDict()
+    key_pat = re.compile('@' + BIBKEY_PAT + r'\{(.*),')
+    value_pat = re.compile(r'[\s]*(\w+)[\s]*=[\s]*{(.*)},?')
+    # Fields of the entry currently being read; None until a header is seen.
+    current = None
+    for line in text:
+        key_match = key_pat.match(line)
+        if key_match:
+            entry_type, key = key_match.groups()
+            current = entries[key] = OrderedDict({'entry_type': entry_type})
+            continue
+        value_match = value_pat.match(line)
+        if value_match and current is not None:
+            # A field seen before any entry header has no owner; skip it
+            # rather than fail on an unbound key.
+            field, value = value_match.groups()
+            current[field] = value
+    return entries
+
+
+def emit_entry(identifier, values, outfd):
+    """Write one BibTeX entry to the file-like object ``outfd``.
+
+    ``values`` maps field names to field values; its ``'entry_type'`` item
+    supplies the ``@type`` of the entry and is not written as a field.
+    """
+    header = '@%s{%s,\n' % (values['entry_type'], identifier)
+    outfd.write(header)
+    field_lines = (
+        '    %s = {%s},\n' % (name, content)
+        for name, content in values.items()
+        if name != 'entry_type'
+    )
+    for field_line in field_lines:
+        outfd.write(field_line)
+    outfd.write("}\n\n")
+
+
+def emit_bibliography(entries, outfd):
+    """Write every entry of ``entries`` to ``outfd``, in iteration order."""
+    for identifier in entries:
+        emit_entry(identifier, entries[identifier], outfd)
+
+
+def subset_bibliography(entries, keys):
+    """Return the entries named in ``keys`` as a new ``OrderedDict``.
+
+    The result is ordered by sorted key.  A key that is missing from
+    ``entries`` is reported through ``logging.critical`` and left out.
+    """
+    selected = OrderedDict()
+    for wanted in sorted(keys):
+        if wanted not in entries:
+            logging.critical("%s not in entries" % wanted)
+            continue
+        selected[wanted] = entries[wanted]
+    return selected
+
+
+def get_keys_from_document(filename):
+    r"""Return the citation keys used in the document ``filename``.
+
+    Two citation styles are recognised:
+
+    * Markdown citations such as ``@key``, ``[@key]``, ``[@key, p. 3]``
+      and ``[@key0; @key1]``.
+    * LaTeX citations in the ``\cite*{key}`` style, where ``*`` can be any
+      character or none, with several comma-separated keys allowed
+      inside the braces.
+
+    Args:
+        filename: Path of the UTF-8 encoded document to scan.
+
+    Returns:
+        A list of key strings, Markdown keys first, then LaTeX keys.  A
+        key cited several times appears several times.
+
+    """
+    k_md = '@' + BIBKEY_PAT
+    # Raw string: the pattern must start with an escaped backslash, not
+    # with the (invalid) regex escape "\c".
+    k_latex = r'\\cite.?\[?(?:.+?)?\]?\{' + BIBKEY_PAT + r'\}'
+
+    with open(filename, 'r', encoding='utf-8') as document:
+        text = document.read()
+
+    matches = []
+    for found in re.findall(k_md, text):
+        # BIBKEY_PAT also accepts punctuation, so "@key," or "@key0;" pick
+        # up the separator that follows the key; drop it.
+        key = found.rstrip('.:;,')
+        if key:
+            matches.append(key)
+
+    for found in re.findall(k_latex, text):
+        # \cite{key0,key1} is captured as one string; split it up.
+        parts = (part.strip() for part in found.split(','))
+        matches.extend(part for part in parts if part)
+
+    logging.debug('Found keys in document: ' + ', '.join(matches))
+    return matches
+
+
+def extract_bibliography(source_doc, source_bib, target_bib):
+    """Write to ``target_bib`` the ``source_bib`` entries cited in ``source_doc``.
+
+    All three arguments are file paths; both bibliography files are
+    opened as UTF-8.
+    """
+    # Keys cited in the document
+    cited_keys = get_keys_from_document(source_doc)
+
+    # Full bibliography, narrowed down to the cited keys
+    with open(source_bib, 'r', encoding='utf-8') as bib_in:
+        bib_lines = bib_in.readlines()
+    all_entries = parse_bibtex(bib_lines)
+    cited_entries = subset_bibliography(all_entries, cited_keys)
+
+    # Resulting subset goes to the new bibliography file
+    with open(target_bib, 'w', encoding='utf-8') as bib_out:
+        emit_bibliography(cited_entries, bib_out)