From 9b08467ab60036cd1405ff5963b355b6a3015981 Mon Sep 17 00:00:00 2001 From: Hetvi Bagdai Date: Sat, 18 Oct 2025 08:03:37 +0000 Subject: [PATCH 1/5] Fix: Update ligra submodule with cstdint fix --- deps/ligra | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/ligra b/deps/ligra index 7755d95..a63488d 160000 --- a/deps/ligra +++ b/deps/ligra @@ -1 +1 @@ -Subproject commit 7755d95fbac4a587ee7c5920d1b927c545f97d07 +Subproject commit a63488d0341cf440dd514fd73099912585b2ea37 From 519b91e721c8cabfb640b48d506c05cbe9f2ce13 Mon Sep 17 00:00:00 2001 From: Hetvi Bagdai Date: Sat, 18 Oct 2025 08:37:23 +0000 Subject: [PATCH 2/5] Feat: Add parallel triangle counting algorithm --- script/cpu/triangle.sh | 7 ++++ src/apps_ligra/CMakeLists.txt | 4 +- src/apps_ligra/triangle.cpp | 74 +++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 script/cpu/triangle.sh create mode 100644 src/apps_ligra/triangle.cpp diff --git a/script/cpu/triangle.sh b/script/cpu/triangle.sh new file mode 100644 index 0000000..26abc00 --- /dev/null +++ b/script/cpu/triangle.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Navigate to the script's directory +cd "$(dirname "$0")" + +# Run the triangle counting executable (with -r 1 for one round) +../../bin/triangle_cpu -r 1 ../../dataset/cnr-2000/ligra/cnr-2000.txt diff --git a/src/apps_ligra/CMakeLists.txt b/src/apps_ligra/CMakeLists.txt index e80a922..c634584 100644 --- a/src/apps_ligra/CMakeLists.txt +++ b/src/apps_ligra/CMakeLists.txt @@ -15,4 +15,6 @@ target_compile_definitions(topo_cpu PRIVATE ${LIGRA_FLAG}) target_compile_definitions(hits_cpu PRIVATE ${LIGRA_FLAG}) - +add_executable(triangle_cpu triangle.cpp) +target_compile_definitions(triangle_cpu PRIVATE ${LIGRA_FLAG}) +target_include_directories(triangle_cpu PRIVATE ${PROJECT_SOURCE_DIR}) diff --git a/src/apps_ligra/triangle.cpp b/src/apps_ligra/triangle.cpp new file mode 100644 index 0000000..ea57089 --- /dev/null +++ 
b/src/apps_ligra/triangle.cpp
@@ -0,0 +1,74 @@
+#include "deps/ligra/ligra/ligra.h"
+#include <atomic>
+
+template <class vertex>
+void Compute(graph<vertex>& GA, commandLine P) {
+  long n = GA.n;
+  std::atomic<long> triangle_count(0);
+
+  parallel_for (long u = 0; u < n; u++) {
+    if (GA.V[u].getOutDegree() == 0) continue;
+
+    bool* u_neighbors_set = newA(bool, n);
+    parallel_for(long i=0; i<n; i++) u_neighbors_set[i] = false;
+    parallel_for(long j=0; j<GA.V[u].getOutDegree(); j++) {
+      u_neighbors_set[GA.V[u].getOutNeighbor(j)] = true;
+    }
+
+    vertexSubset u_frontier(n, u);
+
+    struct Intersect_F {
+      long u;
+      long n;
+      bool* u_set;
+      std::atomic<long>& count;
+      graph<vertex>& GA;
+
+      Intersect_F(long _u, long _n, bool* _set, std::atomic<long>& _c, graph<vertex>& _GA)
+        : u(_u), n(_n), u_set(_set), count(_c), GA(_GA) {}
+
+      inline bool update(uintE s, uintE v) {
+        if (v > u) {
+
+          struct Check_Ngh_F {
+            long v;
+            bool* u_set;
+            std::atomic<long>& count;
+
+            Check_Ngh_F(long _v, bool* _set, std::atomic<long>& _c)
+              : v(_v), u_set(_set), count(_c) {}
+
+            inline bool update(uintE s, uintE w) {
+              if (u_set[w]) {
+                count.fetch_add(1);
+              }
+              return true;
+            }
+            inline bool updateAtomic(uintE s, uintE w) { return update(s,w); }
+            inline bool cond(uintE w) { return (w > v); }
+          };
+
+          vertexSubset v_frontier(n, v);
+          edgeMap(GA, v_frontier, Check_Ngh_F(v, u_set, count), n, false);
+        }
+        return true;
+      }
+      inline bool updateAtomic(uintE s, uintE v) { return update(s,v); }
+      inline bool cond(uintE v) { return true; }
+    };
+
+    edgeMap(GA, u_frontier, Intersect_F(u, n, u_neighbors_set, triangle_count, GA), n, false);
+
+    u_frontier.del();
+    free(u_neighbors_set);
+  }
+
+  // We don't need the buggy timer, Ligra prints its own.
+ cout << "Triangle Count: " << triangle_count.load() << endl; +} From a26569a530aaa3c1ae0616550f21b8f4ce86de02 Mon Sep 17 00:00:00 2001 From: Hetvi Bagdai Date: Sat, 18 Oct 2025 09:05:58 +0000 Subject: [PATCH 3/5] Feat: Add script to analyze compression threshold trade-off --- script/analyze_threshold.sh | 71 +++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 script/analyze_threshold.sh diff --git a/script/analyze_threshold.sh b/script/analyze_threshold.sh new file mode 100644 index 0000000..b93bd94 --- /dev/null +++ b/script/analyze_threshold.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# --- Configuration --- +THRESHOLDS=(8 16 32 64 128) +BIN_DIR="../bin" +DATA_DIR="../dataset" + +# --- Source File Paths --- +# This is the graph created by data_prepare.sh (Ligra format) +LIGRA_GRAPH_FILE="${DATA_DIR}/cnr-2000/ligra/cnr-2000.txt" # Path from bfs.sh +# These are the original CSR binaries found in the /compress subdirectory +ORIGINAL_VLIST="../dataset/cnr-2000/compress/csr_vlist.bin" +ORIGINAL_ELIST="../dataset/cnr-2000/compress/csr_elist.bin" +ORIGINAL_INFO="../dataset/cnr-2000/compress/info.bin" + +# Create a temporary directory for our test files +TEMP_DIR="../temp_analysis" +mkdir -p $TEMP_DIR + +# --- Check for baseline files --- +# Ensure the .bin files exist in the correct subdirectory +if [ ! -f "$ORIGINAL_VLIST" ]; then + echo "Error: Original files (vlist.bin, etc.) not found in ../dataset/cnr-2000/compress/" # Updated error message path + echo "Please run 'bash data_prepare.sh' once to generate them." 
+ rm -rf $TEMP_DIR + exit 1 +fi + +# Get the size of the original uncompressed CSR graph for comparison +BASELINE_SIZE=$(($(stat -c%s "$ORIGINAL_VLIST") + $(stat -c%s "$ORIGINAL_ELIST"))) + +echo "Baseline Uncompressed CSR Size: $BASELINE_SIZE bytes" +echo "--- Starting Analysis ---" +echo "threshold,filtered_size_bytes,size_ratio,bfs_time_seconds" + +# --- Main Loop --- +for t in "${THRESHOLDS[@]}"; do + echo -n "$t," # Print threshold + + # 1. Copy original binaries to temp dir + cp "$ORIGINAL_VLIST" "$TEMP_DIR/vlist.bin" + cp "$ORIGINAL_ELIST" "$TEMP_DIR/elist.bin" + cp "$ORIGINAL_INFO" "$TEMP_DIR/info.bin" + + # 2. Run filter with the current threshold + # We run this from inside TEMP_DIR so it finds the files + (cd $TEMP_DIR && $BIN_DIR/filter vlist.bin elist.bin info.bin $t) > /dev/null 2>&1 + + # 3. Measure filtered size + FILTERED_SIZE=$(($(stat -c%s "$TEMP_DIR/vlist.bin") + $(stat -c%s "$TEMP_DIR/elist.bin"))) + echo -n "$FILTERED_SIZE," + + # Calculate and print size ratio + # Ensure bc is installed: sudo apt install bc + SIZE_RATIO=$(echo "scale=4; $FILTERED_SIZE / $BASELINE_SIZE" | bc) + echo -n "$SIZE_RATIO," + + # 4. Prepare Ligra data from the new filtered binaries + # Output the .ligra file to the temp directory + $BIN_DIR/convert2ligra "$TEMP_DIR/vlist.bin" "$TEMP_DIR/elist.bin" > "$TEMP_DIR/graph.ligra" + + # 5. 
Run and time BFS (run 3 times, get 3rd run's time) + # Pass the .ligra file generated in the temp dir + BFS_TIME=$($BIN_DIR/bfs_cpu -r 3 "$TEMP_DIR/graph.ligra" | grep "Running time" | tail -n 1 | cut -d' ' -f4) + echo "$BFS_TIME" # Last value, print newline + +done + +# --- Cleanup --- +rm -rf $TEMP_DIR +echo "--- Analysis Complete ---" From 4d5961da948f56b1e8654751dc5af23bcdb4d2ad Mon Sep 17 00:00:00 2001 From: Hetvi Bagdai Date: Sat, 18 Oct 2025 10:34:36 +0000 Subject: [PATCH 4/5] Feat: Add script to run full pipeline on new datasets --- script/run_full_pipeline.sh | 60 +++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 script/run_full_pipeline.sh diff --git a/script/run_full_pipeline.sh b/script/run_full_pipeline.sh new file mode 100644 index 0000000..7a848cf --- /dev/null +++ b/script/run_full_pipeline.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# --- Safety Check --- +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + echo "Example: ./run_full_pipeline.sh ca-GrQc" + exit 1 +fi + +# --- Configuration --- +GRAPH_NAME=$1 +BIN_DIR="../bin" +DATA_DIR="../dataset" +GRAPH_EDGELIST="${DATA_DIR}/${GRAPH_NAME}.edgelist" + +# Create a results directory for this graph +RESULTS_DIR="../${GRAPH_NAME}_results" +mkdir -p $RESULTS_DIR + +if [ ! -f "$GRAPH_EDGELIST" ]; then + echo "Error: Edgelist not found at $GRAPH_EDGELIST" + exit 1 +fi + +echo "--- Processing $GRAPH_NAME ---" +cd $RESULTS_DIR # Run everything from this new directory + +# --- Step 1: Edgelist to CSR --- +echo "Converting edgelist to CSR..." +# Pipe the edgelist into the converter. It outputs files in the current dir. +cat $GRAPH_EDGELIST | $BIN_DIR/edgelist2csr +# Creates csr_vlist.bin and csr_elist.bin + +# --- Step 2: Compress --- +echo "Compressing graph..." 
+# Compress reads and *overwrites* its inputs, so we run it on our files +# We redirect stderr (2>) to a file to capture the stats +$BIN_DIR/compress csr_vlist.bin csr_elist.bin 2> compress_stats.txt +echo "Compression stats:" +cat compress_stats.txt + +# --- Step 3: Filter (use default 16) --- +echo "Filtering rules..." +$BIN_DIR/filter csr_vlist.bin csr_elist.bin info.bin 16 > /dev/null 2>&1 + +# --- Step 4: Data Prep for Ligra --- +echo "Preparing data for Ligra..." +$BIN_DIR/convert2ligra csr_vlist.bin csr_elist.bin > ${GRAPH_NAME}.ligra +$BIN_DIR/save_degree csr_vlist.bin > ${GRAPH_NAME}.degree +$BIN_DIR/gene_rule_order csr_vlist.bin csr_elist.bin info.bin > ${GRAPH_NAME}.order + +# --- Step 5: Run Analytics (BFS and PageRank) --- +echo "Running BFS..." +$BIN_DIR/bfs_cpu -r 1 ${GRAPH_NAME}.ligra + +echo "Running PageRank..." +$BIN_DIR/pagerank_cpu -maxiters 10 -i info.bin -d ${GRAPH_NAME}.degree -o ${GRAPH_NAME}.order ${GRAPH_NAME}.ligra + +echo "--- Finished $GRAPH_NAME ---" +echo "All results are in $RESULTS_DIR" From 1b263a553925a3b8d6711339bdbcf5fe33a65f80 Mon Sep 17 00:00:00 2001 From: Hetvi Bagdai Date: Sat, 18 Oct 2025 10:36:10 +0000 Subject: [PATCH 5/5] Docs: Add documentation for project extensions --- README.md | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 70424a7..bf57542 100644 --- a/README.md +++ b/README.md @@ -185,4 +185,54 @@ If you use our code, please cite our paper: ``` - +## 6. Project Extensions (by hetvi3012) + +This section details additional features and analyses added to the original CompressGraph framework. All scripts mentioned are located in the `script` directory or its subdirectories. + +### 6.1 Triangle Counting (CPU) + +* **Goal:** Implements a parallel triangle counting algorithm that operates directly on the CompressGraph representation using the Ligra framework. 
+* **Compilation:** The `triangle_cpu` executable is built automatically when compiling with `-DLIGRA=ON`. Ensure you have applied the `<cstdint>` header fix to the `deps/ligra` code.
+* **How to Run:**
+  ```bash
+  cd script/cpu
+  bash triangle.sh
+  ```
+* **Output:** Prints the total triangle count and the execution time to the console.
+
+### 6.2 Compression Threshold Analysis
+
+* **Goal:** Analyzes the trade-off between the compression rule filter threshold, the resulting graph size (compression ratio), and the performance of BFS.
+* **How to Run:**
+  1. Ensure the baseline `.bin` files exist in `dataset/cnr-2000/compress/` by running `bash script/data_prepare.sh` once.
+  2. Run the analysis script:
+     ```bash
+     cd script
+     ./analyze_threshold.sh
+     ```
+* **Output:** Prints a CSV table to the console showing `threshold`, `filtered_size_bytes`, `size_ratio`, and `bfs_time_seconds`. This data can be used to plot the trade-off curve.
+
+### 6.3 Testing on Different Graph Types
+
+* **Goal:** Evaluates the effectiveness of CompressGraph's rule-based compression and analytics performance on graph structures different from the default web graph. Tests on a collaboration network (`ca-GrQc`) and a road network (`roadNet-CA`).
+* **Data Preparation:**
+  1. Download the datasets (run from the `dataset` directory):
+     ```bash
+     # Collaboration Network
+     wget https://snap.stanford.edu/data/ca-GrQc.txt.gz
+     gunzip ca-GrQc.txt.gz
+     grep -v "^#" ca-GrQc.txt > ca-GrQc.edgelist
+
+     # Road Network
+     wget https://snap.stanford.edu/data/roadNet-CA.txt.gz
+     gunzip roadNet-CA.txt.gz
+     grep -v "^#" roadNet-CA.txt > roadNet-CA.edgelist
+     ```
+* **How to Run:**
+  Use the `run_full_pipeline.sh` script, providing the base name of the graph edgelist file in the `dataset` directory.
Run from the `script` directory: + ```bash + cd script + ./run_full_pipeline.sh ca-GrQc + ./run_full_pipeline.sh roadNet-CA + ``` +* **Output:** Creates a results directory for each graph (e.g., `ca-GrQc_results`, `roadNet-CA_results`) containing intermediate files, compression stats (`compress_stats.txt`), and prints analytics output to the console. Compare the compression ratios and run times against the `cnr-2000` graph. Note: PageRank may fail on these graphs.