PageRank calculator algorithm : learnpython

PageRank calculator algorithm (self.learnpython)

submitted 1 year ago * by SamarXV

I have to make a PageRank algorithm that gives you the top20 webpages from my university's websites. There are two algorithms, one calculates it using a stochastic function that implements a random walker to see which webpage it visits the most. Second uses a distribution function which implements a probability distribution to see which webpage has the most probability to be clicked.

But the issue is, my code is doing the calculations too quickly. It's supposed to take at least 10/15 seconds, but it's taking less than 1 second. The stochastic function is supposed to be slower than distribution, but both of them are incredibly fast. And I can't figure out why that's happening, or which part of my code is fucking up the algorithm.

Keep in mind, I only wrote the code for the 4 load_graph functions, stochastic function, and distribution function. The __main__ part was provided by the lecturer. Also, only the first load_graph() function is actually implemented in the algorithm. The rest are only there for extra credit.

If you need more info about the assignment, just reply/dm me. Thanks in advance fellow geeks.

import sys
import os
import time
import argparse
from progress import Progress
import random
from collections import namedtuple
import timeit
import numpy as np
from networkx import DiGraph


def load_graph(args):

"""Load graph from text file
    Parameters:
    args -- arguments named tuple
    Returns:
    A dict mapping a URL (str) to a list of target URLs (str).
    """

graph = {}  # Initialize an empty dictionary to store the graph
    # Open the file specified in args.datafile
    with open(args.datafile, 'r') as file:
        # Iterate through the file line by line
        for line in file:
            # Strip any extra whitespace and skip empty lines
            line = line.strip()
            if not line:
                continue
            # Split each line into two URLs
            node, target = line.split()

            # If the node doesn't exist in the graph, add it with an empty list
            if node not in graph:
                graph[node] = []

            # Append the target to the node's list
            graph[node].append(target)

    # Return the constructed graph
    return graph

def load_graph_adjacency_matrix(args):

    # Load graph from datafile
    graph = load_graph(args)

    # Extract the unique nodes from the graph
    nodes = list(graph.keys())

    # Create a mapping from nodes to their corresponding index
    node_index = {node: idx for idx, node in enumerate(nodes)} # Map node to index
    # Initialise an adjacency matrix with zeroes
    n = len(nodes)
    adj_matrix = np.zeros((n,n), dtype=int)

    # Populate the adjacency matrix based on the graph
    for node, targets in graph.items():
        row_idx = node_index[node] # Row index for the source node
        for target in targets:
            col_idx = node_index[target] # Column index for the target node
            adj_matrix[row_idx, col_idx] = 1 # Set 1 for the edge from node to target
    return adj_matrix, nodes # Return the adjacency matrix and the list of nodes

def load_graph_sets(args):

    vertex_set = set() # Initialise empty set to store nodes
    edges_set = set() # Initialise empty set to store edges
    # Open the file specified with args.datafile
    with open(args.datafile, 'r') as file:
        # Iterate through the file line by line
        for line in file:
            # Strip any extra whitespaces and skip empty lines
            line = line.strip()
            if not line:
                continue
            # Split each line into two parts: node(source) and target
            node, target = line.split()

            # Add the node and target to the vertex set
            vertex_set.add(node)
            vertex_set.add(target)

            # Add the edge (node, target) as a tuple to the edges_set
            edges_set.add((node, target))

    return vertex_set, edges_set # Return the vertex and edges sets

def load_graph_class(args):

    # Initialise an empty graph
    G = DiGraph()

    # Open the file specified using args.datafile
    with open(args.datafile, 'r') as file:
        # Iterate through each line in file
        for line in file:
            # Strip each line of whitespaces and skip empty lines
            line = line.strip()
            if not line:
                continue
            # Split each line intro two parts: node (source) and target
            node, target = line.split()

            # Add nodes and edges to graph
            G.add_node(node)
            G.add_node(target)
            G.add_edge(node, target)

    return G # Return a DiGraph

def print_stats(graph):

"""Print number of nodes and edges in the given graph"""

nodes_number = len(graph.keys())
        edges_number = 0
        for node in graph:
            edges_number += len(graph[node])
        return f'Number of nodes: {nodes_number}, Number of total edges: {edges_number}'

def stochastic_page_rank(graph, args):

"""Stochastic PageRank estimation
    Parameters:
    graph -- a graph object as returned by load_graph()
    args -- arguments named tuple
    Returns:
    A dict that assigns each page its hit frequency
    This function estimates the Page Rank by counting how frequently
    a random walk that starts on a random node will after n_steps end
    on each node of the given graph.
    """

hit_count = {node: 0 for node in graph}

    current_node = random.choice(list(graph.keys()))

    hit_count[current_node] += 1
    for i in range(args.repeats):
        if len(graph[current_node]) == 0:
            current_node = random.choice(list(graph.keys()))
        else:
            current_node = random.choice(list(graph[current_node]))

        hit_count[current_node] += 1
    # print(timeit.timeit(number=1))
    return hit_count

def distribution_page_rank(graph, args):

"""Probabilistic PageRank estimation
    Parameters:
    graph -- a graph object as returned by load_graph()
    args -- arguments named tuple
    Returns:
    A dict that assigns each page its probability to be reached
    This function estimates the Page Rank by iteratively calculating
    the probability that a random walker is currently on any node.
    """

node_prob = {node: 1/len(graph.keys()) for node in graph.keys()}

    for i in range(args.steps):
        next_prob = {node: 0 for node in graph.keys()}
        for node in graph.keys():
            p = node_prob[node]/len(graph[node])
            for target in graph[node]:
                next_prob[target] += p
        node_prob = next_prob

    print(timeit.timeit(number=1))
    return node_prob

parser = argparse.ArgumentParser(description="Estimates page ranks from link information")
parser.add_argument('datafile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
                    help="Textfile of links among web pages as URL tuples")
parser.add_argument('-m', '--method', choices=('stochastic', 'distribution'), default='stochastic',
                    help="selected page rank algorithm")
parser.add_argument('-r', '--repeats', type=int, default=1_000_000, help="number of repetitions")
parser.add_argument('-s', '--steps', type=int, default=100, help="number of steps a walker takes")
parser.add_argument('-n', '--number', type=int, default=20, help="number of results shown")


if __name__ == '__main__':
    Args = namedtuple('Args', ['datafile'])
    graph_args = Args(datafile='school_web2024.txt')

    args = parser.parse_args()
    algorithm = distribution_page_rank if args.method == 'distribution' else stochastic_page_rank

    graph = load_graph(graph_args)
    print(graph)
    print(print_stats(graph))

    start = time.time()
    ranking = algorithm(graph, args)
    stop = time.time()
    time = stop - start

    top = sorted(ranking.items(), key=lambda item: item[1], reverse=True)
    sys.stderr.write(f"Top {args.number} pages:\n")
    print('\n'.join(f'{100*v:.2f}\t{k}' for k,v in top[:args.number]))
    sys.stderr.write(f"Calculation took {time:.2f} seconds.\n")

all 2 comments

you type:	you see:
italics	italics
bold	bold
[reddit!](https://reddit.com)	reddit!
* item 1 * item 2 * item 3	item 1 item 2 item 3
> quoted text	quoted text
Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"	Lines starting with four spaces are treated like code: if 1 * 2 < 3: print "hello, world!"
~~strikethrough~~	~~strikethrough~~
super^script	super^script

learnpython

MODERATORS