Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -185,4 +185,54 @@ If you use our code, please cite our paper:
```



## 6. Project Extensions (by hetvi3012)

This section details additional features and analyses added to the original CompressGraph framework. All scripts mentioned are located in the `script` directory or its subdirectories.

### 6.1 Triangle Counting (CPU)

* **Goal:** Implements a parallel triangle counting algorithm that operates directly on the CompressGraph representation using the Ligra framework.
* **Compilation:** The `triangle_cpu` executable is built automatically when compiling with `-DLIGRA=ON`. Ensure you have applied the `<cstdint>` header fix to the `deps/ligra` code.
* **How to Run:**
```bash
cd script/cpu
bash triangle.sh
```
* **Output:** Prints the total triangle count and the execution time to the console.

### 6.2 Compression Threshold Analysis

* **Goal:** Analyzes the trade-off between the compression rule filter threshold, the resulting graph size (compression ratio), and the performance of BFS.
* **How to Run:**
1. Ensure the baseline `.bin` files exist in `dataset/cnr-2000/compress/` by running `bash script/data_prepare.sh` once.
2. Run the analysis script:
```bash
cd script
./analyze_threshold.sh
```
* **Output:** Prints a CSV table to the console showing `threshold`, `filtered_size_bytes`, `size_ratio`, and `bfs_time_seconds`. This data can be used to plot the trade-off curve.

### 6.3 Testing on Different Graph Types

* **Goal:** Evaluates the effectiveness of CompressGraph's rule-based compression and analytics performance on graph structures different from the default web graph. Tests on a collaboration network (`ca-GrQc`) and a road network (`roadNet-CA`).
* **Data Preparation:**
1. Download the datasets (run from the `dataset` directory):
```bash
# Collaboration Network
wget https://snap.stanford.edu/data/ca-GrQc.txt.gz
gunzip ca-GrQc.txt.gz
grep -v "^#" ca-GrQc.txt > ca-GrQc.edgelist

# Road Network
wget https://snap.stanford.edu/data/roadNet-CA.txt.gz
gunzip roadNet-CA.txt.gz
grep -v "^#" roadNet-CA.txt > roadNet-CA.edgelist
```
* **How to Run:**
Use the `run_full_pipeline.sh` script, providing the base name of the graph edgelist file in the `dataset` directory. Run from the `script` directory:
```bash
cd script
./run_full_pipeline.sh ca-GrQc
./run_full_pipeline.sh roadNet-CA
```
* **Output:** Creates a results directory for each graph (e.g., `ca-GrQc_results`, `roadNet-CA_results`) containing intermediate files, compression stats (`compress_stats.txt`), and prints analytics output to the console. Compare the compression ratios and run times against the `cnr-2000` graph. Note: PageRank may fail on these graphs.
2 changes: 1 addition & 1 deletion deps/ligra
Submodule ligra updated from 7755d9 to a63488
71 changes: 71 additions & 0 deletions script/analyze_threshold.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# Sweep the rule-filter threshold and report, per threshold value:
# the filtered CSR size, its ratio against the uncompressed baseline,
# and the BFS running time. Emits a CSV table on stdout.
# Run from the script/ directory.

# --- Configuration ---
THRESHOLDS=(8 16 32 64 128)
BIN_DIR="../bin"
DATA_DIR="../dataset"

# --- Source File Paths ---
# This is the graph created by data_prepare.sh (Ligra format)
LIGRA_GRAPH_FILE="${DATA_DIR}/cnr-2000/ligra/cnr-2000.txt" # Path from bfs.sh
# These are the original CSR binaries found in the /compress subdirectory
ORIGINAL_VLIST="${DATA_DIR}/cnr-2000/compress/csr_vlist.bin"
ORIGINAL_ELIST="${DATA_DIR}/cnr-2000/compress/csr_elist.bin"
ORIGINAL_INFO="${DATA_DIR}/cnr-2000/compress/info.bin"

# Temporary working directory; removed on every exit path below.
TEMP_DIR="../temp_analysis"
mkdir -p "$TEMP_DIR"

# --- Check for baseline files ---
# Verify ALL three baseline binaries, not just the vertex list: the loop
# below copies and filters every one of them.
for f in "$ORIGINAL_VLIST" "$ORIGINAL_ELIST" "$ORIGINAL_INFO"; do
    if [ ! -f "$f" ]; then
        echo "Error: baseline file '$f' not found in ${DATA_DIR}/cnr-2000/compress/" >&2
        echo "Please run 'bash data_prepare.sh' once to generate it." >&2
        rm -rf "$TEMP_DIR"
        exit 1
    fi
done

# Size of the original uncompressed CSR graph (vlist + elist bytes),
# used as the denominator for the size ratio.
BASELINE_SIZE=$(($(stat -c%s "$ORIGINAL_VLIST") + $(stat -c%s "$ORIGINAL_ELIST")))

echo "Baseline Uncompressed CSR Size: $BASELINE_SIZE bytes"
echo "--- Starting Analysis ---"
echo "threshold,filtered_size_bytes,size_ratio,bfs_time_seconds"

# --- Main Loop ---
for t in "${THRESHOLDS[@]}"; do
    echo -n "$t," # Print threshold

    # 1. Start each iteration from a fresh copy of the originals, because
    #    filter rewrites its input files in place.
    cp "$ORIGINAL_VLIST" "$TEMP_DIR/vlist.bin"
    cp "$ORIGINAL_ELIST" "$TEMP_DIR/elist.bin"
    cp "$ORIGINAL_INFO"  "$TEMP_DIR/info.bin"

    # 2. Run filter with the current threshold, from inside TEMP_DIR so it
    #    finds the files. The relative BIN_DIR stays valid there because
    #    script/ and temp_analysis/ both sit one level below the repo root.
    (cd "$TEMP_DIR" && "$BIN_DIR/filter" vlist.bin elist.bin info.bin "$t") > /dev/null 2>&1

    # 3. Measure filtered size (vlist + elist bytes)
    FILTERED_SIZE=$(($(stat -c%s "$TEMP_DIR/vlist.bin") + $(stat -c%s "$TEMP_DIR/elist.bin")))
    echo -n "$FILTERED_SIZE,"

    # Ratio of filtered size to baseline, 4 decimal places.
    # Ensure bc is installed: sudo apt install bc
    SIZE_RATIO=$(echo "scale=4; $FILTERED_SIZE / $BASELINE_SIZE" | bc)
    echo -n "$SIZE_RATIO,"

    # 4. Regenerate the Ligra input from the freshly filtered binaries
    "$BIN_DIR/convert2ligra" "$TEMP_DIR/vlist.bin" "$TEMP_DIR/elist.bin" > "$TEMP_DIR/graph.ligra"

    # 5. Time BFS; keep the last "Running time" line the binary prints.
    # NOTE(review): in stock Ligra, "-r" selects the BFS *source vertex*
    # and "-rounds" the number of timed runs — confirm which flag this
    # bfs_cpu binary actually expects before relying on "3 runs".
    BFS_TIME=$("$BIN_DIR/bfs_cpu" -r 3 "$TEMP_DIR/graph.ligra" | grep "Running time" | tail -n 1 | cut -d' ' -f4)
    echo "$BFS_TIME" # Last CSV field, print newline

done

# --- Cleanup ---
rm -rf "$TEMP_DIR"
echo "--- Analysis Complete ---"
7 changes: 7 additions & 0 deletions script/cpu/triangle.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Wrapper for the CPU triangle-counting app on the cnr-2000 Ligra graph.

# Resolve everything relative to this script's own directory so the
# wrapper works no matter where it is invoked from.
cd "$(dirname "$0")"

# -r 1 requests a single round of the computation.
../../bin/triangle_cpu -r 1 ../../dataset/cnr-2000/ligra/cnr-2000.txt
60 changes: 60 additions & 0 deletions script/run_full_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# End-to-end CompressGraph pipeline for one graph:
# edgelist -> CSR -> compress -> filter -> Ligra prep -> BFS + PageRank.
# Usage: ./run_full_pipeline.sh <graph_name>
# Expects dataset/<graph_name>.edgelist; writes ../<graph_name>_results/.

# --- Safety Check ---
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 <graph_name_in_dataset_dir_no_ext>"
    echo "Example: ./run_full_pipeline.sh ca-GrQc"
    exit 1
fi

# --- Configuration ---
GRAPH_NAME=$1
BIN_DIR="../bin"
DATA_DIR="../dataset"
GRAPH_EDGELIST="${DATA_DIR}/${GRAPH_NAME}.edgelist"

# Fail BEFORE creating the results directory so a bad graph name
# does not leave an empty <name>_results/ behind.
if [ ! -f "$GRAPH_EDGELIST" ]; then
    echo "Error: Edgelist not found at $GRAPH_EDGELIST" >&2
    exit 1
fi

# Per-graph results directory
RESULTS_DIR="../${GRAPH_NAME}_results"
mkdir -p "$RESULTS_DIR"

echo "--- Processing $GRAPH_NAME ---"
# Run everything from the results directory; abort if the cd fails so the
# pipeline never scribbles its output into the wrong place. The relative
# paths (../bin, ../dataset) still resolve here because <name>_results/
# sits at the same depth as script/.
cd "$RESULTS_DIR" || exit 1

# --- Step 1: Edgelist to CSR ---
echo "Converting edgelist to CSR..."
# Feed the edgelist to the converter; it writes csr_vlist.bin and
# csr_elist.bin into the current directory.
"$BIN_DIR/edgelist2csr" < "$GRAPH_EDGELIST"

# --- Step 2: Compress ---
echo "Compressing graph..."
# compress reads and *overwrites* its inputs; stderr carries the stats,
# which we capture into a file for later comparison across graphs.
"$BIN_DIR/compress" csr_vlist.bin csr_elist.bin 2> compress_stats.txt
echo "Compression stats:"
cat compress_stats.txt

# --- Step 3: Filter (use default threshold 16) ---
echo "Filtering rules..."
"$BIN_DIR/filter" csr_vlist.bin csr_elist.bin info.bin 16 > /dev/null 2>&1

# --- Step 4: Data Prep for Ligra ---
echo "Preparing data for Ligra..."
"$BIN_DIR/convert2ligra" csr_vlist.bin csr_elist.bin > "${GRAPH_NAME}.ligra"
"$BIN_DIR/save_degree" csr_vlist.bin > "${GRAPH_NAME}.degree"
"$BIN_DIR/gene_rule_order" csr_vlist.bin csr_elist.bin info.bin > "${GRAPH_NAME}.order"

# --- Step 5: Run Analytics (BFS and PageRank) ---
echo "Running BFS..."
"$BIN_DIR/bfs_cpu" -r 1 "${GRAPH_NAME}.ligra"

echo "Running PageRank..."
"$BIN_DIR/pagerank_cpu" -maxiters 10 -i info.bin -d "${GRAPH_NAME}.degree" -o "${GRAPH_NAME}.order" "${GRAPH_NAME}.ligra"

echo "--- Finished $GRAPH_NAME ---"
echo "All results are in $RESULTS_DIR"
4 changes: 3 additions & 1 deletion src/apps_ligra/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ target_compile_definitions(topo_cpu PRIVATE ${LIGRA_FLAG})
target_compile_definitions(hits_cpu PRIVATE ${LIGRA_FLAG})



# CPU triangle-counting app built on Ligra (see triangle.cpp in this dir).
add_executable(triangle_cpu triangle.cpp)
# Same Ligra compile definitions as the sibling *_cpu targets above.
target_compile_definitions(triangle_cpu PRIVATE ${LIGRA_FLAG})
# Project root on the include path so the "deps/ligra/ligra/ligra.h"
# include in triangle.cpp resolves.
target_include_directories(triangle_cpu PRIVATE ${PROJECT_SOURCE_DIR})
74 changes: 74 additions & 0 deletions src/apps_ligra/triangle.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#include "deps/ligra/ligra/ligra.h"
#include <atomic>

// Parallel triangle counting.
//
// For every vertex u with nonzero out-degree, materialize N(u) as a boolean
// membership array, then for every out-neighbor v > u walk N(v) and count
// each w > v that is also in N(u). Each triangle (u < v < w) is thus counted
// exactly once — assumes the input graph is undirected/symmetrized (TODO
// confirm against the data-prep pipeline).
//
// NOTE(review): the per-vertex O(n) allocation + clear of the membership
// array makes total work Omega(n^2) regardless of edge count; acceptable for
// small graphs, but a sorted-adjacency intersection would scale better.
//
// BUGFIX: in Ligra, edgeMap returns an output vertexSubset whose backing
// storage must be released with .del(); the original leaked that subset on
// every edgeMap call (and leaked the inner v_frontier), once per vertex per
// level of nesting. All of them are now del()'d.
template <class vertex>
void Compute(graph<vertex>& GA, commandLine P) {
  long n = GA.n;
  std::atomic<long> triangle_count(0);  // shared across all parallel workers

  parallel_for (long u = 0; u < n; u++) {
    if (GA.V[u].getOutDegree() == 0) continue;

    // Membership bitmap for N(u); freed at the end of this iteration.
    bool* u_neighbors_set = newA(bool, n);
    parallel_for (long i = 0; i < n; i++) u_neighbors_set[i] = 0;

    // edgeMap functor: marks every out-neighbor of the frontier in `set`.
    struct FillSet_F {
      bool* set;
      FillSet_F(bool* _set) : set(_set) {}
      inline bool update (uintE s, uintE d) { set[d] = 1; return true; }
      // Racing writes of 1 to distinct bools are benign here.
      inline bool updateAtomic (uintE s, uintE d) { set[d] = 1; return true; }
      inline bool cond(uintE d) { return true; }
    };

    vertexSubset u_frontier(n, u);  // singleton frontier {u}
    {
      vertexSubset fill_out =
          edgeMap(GA, u_frontier, FillSet_F(u_neighbors_set), n, false);
      fill_out.del();  // release edgeMap's output subset (was leaked)
    }

    // edgeMap functor over N(u): for each neighbor v > u, scan N(v) for
    // common neighbors w > v and bump the shared triangle counter.
    struct Intersect_F {
      long u;
      long n;
      bool* u_set;
      std::atomic<long>& count;
      graph<vertex>& GA;

      Intersect_F(long _u, long _n, bool* _set, std::atomic<long>& _c,
                  graph<vertex>& _GA)
          : u(_u), n(_n), u_set(_set), count(_c), GA(_GA) {}

      inline bool update(uintE s, uintE v) {
        if (v > u) {
          // Inner functor over N(v): count w > v that also lie in N(u).
          struct Check_Ngh_F {
            long v;
            bool* u_set;
            std::atomic<long>& count;

            Check_Ngh_F(long _v, bool* _set, std::atomic<long>& _c)
                : v(_v), u_set(_set), count(_c) {}

            inline bool update(uintE s, uintE w) {
              if (u_set[w]) {
                count.fetch_add(1);
              }
              return true;
            }
            inline bool updateAtomic(uintE s, uintE w) { return update(s, w); }
            // Only consider w > v so each triangle is counted once.
            inline bool cond(uintE w) { return (w > v); }
          };

          vertexSubset v_frontier(n, v);
          vertexSubset chk_out =
              edgeMap(GA, v_frontier, Check_Ngh_F(v, u_set, count), n, false);
          chk_out.del();     // release edgeMap's output subset (was leaked)
          v_frontier.del();  // release the singleton frontier (was leaked)
        }
        return true;
      }
      inline bool updateAtomic(uintE s, uintE v) { return update(s, v); }
      inline bool cond(uintE v) { return true; }
    };

    {
      vertexSubset isect_out = edgeMap(
          GA, u_frontier, Intersect_F(u, n, u_neighbors_set, triangle_count, GA),
          n, false);
      isect_out.del();  // release edgeMap's output subset (was leaked)
    }

    u_frontier.del();
    free(u_neighbors_set);
  }

  // Ligra's driver prints its own timing; we only report the count.
  cout << "Triangle Count: " << triangle_count.load() << endl;
}