-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathArticleParser.h
More file actions
35 lines (29 loc) · 1014 Bytes
/
ArticleParser.h
File metadata and controls
35 lines (29 loc) · 1014 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#pragma once
#include "PageItem.h"
#include "VectorStorage.h"
#include <string>
#include <vector>
#include <future>
#include <list>
/*
This class is responsible for parsing JSON files containing articles.
It relies on WikipediaSearch.py to generate the JSON files from Wikipedia dumps.
*/
// Parses JSON files containing articles and stores them in vector storage
class ArticleParser {
public:
    /// Creates a parser for the JSON article files.
    /// @param jsonPath  relative path to the JSON files
    /// @param batchSize number of articles collected before they are put into the DB
    /// @param storage   vector storage to write to; held by reference, so it must outlive this object
    /// @param maxPages  maximum number of pages to parse (-1 for no limit)
    ArticleParser(const std::string& jsonPath,
                  size_t batchSize,
                  VectorStorage& storage,
                  int maxPages);

    /// Parses the JSON files.
    void parseJSONFiles();

private:
    /// Flushes one batch of PageItems to the vector storage.
    void flushBatch(std::vector<PageItem>& batch);

    // Data members. Their relative order is intentionally left unchanged:
    // it determines initialization order and (reversed) destruction order.

    /// Futures of tasks that have been started.
    /// NOTE(review): presumably asynchronous batch flushes; confirm in ArticleParser.cpp.
    std::list<std::future<void>> activeTasks;

    /// Relative path to the JSON files.
    std::string jsonPath;

    /// Batch size for processing; articles are put into the DB after this many.
    size_t batchSize;

    /// Reference to the vector storage (not owned).
    VectorStorage& storage;

    /// Maximum number of pages to parse (-1 for no limit).
    int maxPages;
};