|
1 | | -# Copyright (c) 2024 Microsoft Corporation. |
| 1 | +# Copyright (C) 2026 Microsoft |
2 | 2 | # Licensed under the MIT License |
3 | 3 |
|
4 | 4 | """Graph extraction using NLP.""" |
5 | 5 |
|
| 6 | +import logging |
| 7 | +from collections import defaultdict |
6 | 8 | from itertools import combinations |
7 | 9 |
|
8 | | -import numpy as np |
9 | 10 | import pandas as pd |
10 | 11 | from graphrag_cache import Cache |
| 12 | +from graphrag_storage.tables.table import Table |
11 | 13 |
|
12 | | -from graphrag.config.enums import AsyncType |
13 | 14 | from graphrag.graphs.edge_weights import calculate_pmi_edge_weights |
14 | 15 | from graphrag.index.operations.build_noun_graph.np_extractors.base import ( |
15 | 16 | BaseNounPhraseExtractor, |
16 | 17 | ) |
17 | | -from graphrag.index.utils.derive_from_rows import derive_from_rows |
18 | 18 | from graphrag.index.utils.hashing import gen_sha512_hash |
19 | 19 |
|
| 20 | +logger = logging.getLogger(__name__) |
| 21 | + |
20 | 22 |
|
async def build_noun_graph(
    text_unit_table: Table,
    text_analyzer: BaseNounPhraseExtractor,
    normalize_edge_weights: bool,
    cache: Cache,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build a noun graph from text units.

    Extracts noun phrases from every text unit, turns each distinct
    phrase into a node, and connects phrases that co-occur in the
    same text unit.

    Returns a (nodes, edges) pair of dataframes with schemas
    [title, frequency, text_unit_ids] and
    [source, target, weight, text_unit_ids] respectively.
    """
    title_to_ids = await _extract_nodes(text_unit_table, text_analyzer, cache=cache)

    # One node per distinct noun phrase; frequency is the number of
    # text units the phrase appeared in.
    node_records = []
    for title, unit_ids in title_to_ids.items():
        node_records.append({
            "title": title,
            "frequency": len(unit_ids),
            "text_unit_ids": unit_ids,
        })
    nodes_df = pd.DataFrame(
        node_records,
        columns=["title", "frequency", "text_unit_ids"],
    )

    edges_df = _extract_edges(
        title_to_ids,
        nodes_df=nodes_df,
        normalize_edge_weights=normalize_edge_weights,
    )
    return nodes_df, edges_df
40 | 54 |
|
41 | 55 |
|
async def _extract_nodes(
    text_unit_table: Table,
    text_analyzer: BaseNounPhraseExtractor,
    cache: Cache,
) -> dict[str, list[str]]:
    """Extract noun-phrase nodes from text units.

    NLP extraction is CPU-bound (spaCy/TextBlob), so threading
    provides no benefit under the GIL. We process rows
    sequentially, relying on the cache to skip repeated work.

    Returns a mapping of noun-phrase title to text-unit ids.
    """
    phrase_cache = cache.child("extract_noun_phrases")
    row_count = await text_unit_table.length()
    phrase_index: dict[str, list[str]] = defaultdict(list)
    processed = 0

    async for record in text_unit_table:
        unit_id = record["id"]
        unit_text = record["text"]

        # Cache key covers both the text and the analyzer identity, so
        # switching analyzers never reuses stale extractions.
        attrs = {"text": unit_text, "analyzer": str(text_analyzer)}
        cache_key = gen_sha512_hash(attrs, attrs.keys())
        phrases = await phrase_cache.get(cache_key)
        if not phrases:
            # NOTE(review): a cached *empty* extraction is falsy too, so it
            # is re-extracted on every run — presumably intentional; confirm
            # against the Cache miss sentinel before tightening to `is None`.
            phrases = text_analyzer.extract(unit_text)
            await phrase_cache.set(cache_key, phrases)

        for noun_phrase in phrases:
            phrase_index[noun_phrase].append(unit_id)

        processed += 1
        # Log every 100 rows, plus a final line when the table is exhausted.
        if processed % 100 == 0 or processed == row_count:
            logger.info(
                "extract noun phrases progress: %d/%d",
                processed,
                row_count,
            )

    return dict(phrase_index)
88 | 97 |
|
89 | 98 |
|
90 | 99 | def _extract_edges( |
| 100 | + title_to_ids: dict[str, list[str]], |
91 | 101 | nodes_df: pd.DataFrame, |
92 | 102 | normalize_edge_weights: bool = True, |
93 | 103 | ) -> pd.DataFrame: |
94 | | - """ |
95 | | - Extract edges from nodes. |
| 104 | + """Build co-occurrence edges between noun phrases. |
96 | 105 |
|
97 | | - Nodes appear in the same text unit are connected. |
98 | | - Input: nodes_df with schema [id, title, frequency, text_unit_ids] |
99 | | - Returns: edges_df with schema [source, target, weight, text_unit_ids] |
| 106 | + Nodes that appear in the same text unit are connected. |
| 107 | + Returns edges with schema [source, target, weight, text_unit_ids]. |
100 | 108 | """ |
101 | | - if nodes_df.empty: |
102 | | - return pd.DataFrame(columns=["source", "target", "weight", "text_unit_ids"]) |
103 | | - |
104 | | - text_units_df = nodes_df.explode("text_unit_ids") |
105 | | - text_units_df = text_units_df.rename(columns={"text_unit_ids": "text_unit_id"}) |
106 | | - text_units_df = ( |
107 | | - text_units_df |
108 | | - .groupby("text_unit_id") |
109 | | - .agg({"title": lambda x: list(x) if len(x) > 1 else np.nan}) |
110 | | - .reset_index() |
111 | | - ) |
112 | | - text_units_df = text_units_df.dropna() |
113 | | - titles = text_units_df["title"].tolist() |
114 | | - all_edges: list[list[tuple[str, str]]] = [list(combinations(t, 2)) for t in titles] |
115 | | - |
116 | | - text_units_df = text_units_df.assign(edges=all_edges) # type: ignore |
117 | | - edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]] |
118 | | - |
119 | | - edge_df[["source", "target"]] = edge_df.loc[:, "edges"].to_list() |
120 | | - edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1) |
121 | | - edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1) |
122 | | - edge_df = edge_df.drop(columns=["source", "target"]).rename( |
123 | | - columns={"min_source": "source", "max_target": "target"} # type: ignore |
| 109 | + if not title_to_ids: |
| 110 | + return pd.DataFrame( |
| 111 | + columns=["source", "target", "weight", "text_unit_ids"], |
| 112 | + ) |
| 113 | + |
| 114 | + text_unit_to_titles: dict[str, list[str]] = defaultdict(list) |
| 115 | + for title, tu_ids in title_to_ids.items(): |
| 116 | + for tu_id in tu_ids: |
| 117 | + text_unit_to_titles[tu_id].append(title) |
| 118 | + |
| 119 | + edge_map: dict[tuple[str, str], list[str]] = defaultdict(list) |
| 120 | + for tu_id, titles in text_unit_to_titles.items(): |
| 121 | + if len(titles) < 2: |
| 122 | + continue |
| 123 | + for pair in combinations(sorted(set(titles)), 2): |
| 124 | + edge_map[pair].append(tu_id) |
| 125 | + |
| 126 | + records = [ |
| 127 | + { |
| 128 | + "source": src, |
| 129 | + "target": tgt, |
| 130 | + "weight": len(tu_ids), |
| 131 | + "text_unit_ids": tu_ids, |
| 132 | + } |
| 133 | + for (src, tgt), tu_ids in edge_map.items() |
| 134 | + ] |
| 135 | + edges_df = pd.DataFrame( |
| 136 | + records, |
| 137 | + columns=["source", "target", "weight", "text_unit_ids"], |
124 | 138 | ) |
125 | 139 |
|
126 | | - edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())] |
127 | | - edge_df = edge_df.drop(columns=["edges"]) |
128 | | - # group by source and target, count the number of text units |
129 | | - grouped_edge_df = ( |
130 | | - edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index() |
131 | | - ) |
132 | | - grouped_edge_df = grouped_edge_df.rename(columns={"text_unit_id": "text_unit_ids"}) |
133 | | - grouped_edge_df["weight"] = grouped_edge_df["text_unit_ids"].apply(len) |
134 | | - grouped_edge_df = grouped_edge_df.loc[ |
135 | | - :, ["source", "target", "weight", "text_unit_ids"] |
136 | | - ] |
137 | | - if normalize_edge_weights: |
138 | | - # use PMI weight instead of raw weight |
139 | | - grouped_edge_df = calculate_pmi_edge_weights(nodes_df, grouped_edge_df) |
| 140 | + if normalize_edge_weights and not edges_df.empty: |
| 141 | + edges_df = calculate_pmi_edge_weights(nodes_df, edges_df) |
140 | 142 |
|
141 | | - return grouped_edge_df |
| 143 | + return edges_df |
0 commit comments