Skip to main content

Command Palette

Search for a command to run...

分布模型與排名估計分析

Updated

1. 核心問題

給定一個具體的數值佔比,如何通過不同的(離散)分布模型來估計其可能的排名位置?

2. 核心流程

  1. 確定模型與參數

    • 實現模型分布

    • 設置模型參數(α, β 等)

  2. 計算各排序數值佔比

    • 計算各排名位置的數值

    • 將數值轉換為數值佔比

  3. 尋找目標排名

    • 尋找最接近目標數值佔比的排名數值佔比

    • 評估估計誤差範圍

3. 模型比較

| 模型 | 核心參數 | 分布特徵 | 主要優點 | 限制 |
| --- | --- | --- | --- | --- |
| Power Law | α:冪律指數 | 嚴格的長尾特性;線性衰減(log-log 尺度) | 能準確描述極端不平等;易於理解和解釋 | 無法描述複雜的分布變化;在中間區域擬合度較差 |
| Stretched Exponential | β:拉伸參數;x₀:特徵尺度 | 較快的初始衰減;溫和的尾部 | 可描述漸進轉變;平滑的分布形態 | 難以捕捉突變點;參數物理含義不直觀 |

4. 實現代碼

from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np

@dataclass
class DPLNParams:
    """Parameter bundle holding three numeric fields and a text label.

    NOTE(review): the name suggests a Double Pareto-Lognormal (DPLN) model,
    but nothing in the visible code constructs or reads this class -- confirm
    the intended use before relying on the field descriptions below.
    """
    alpha: float  # presumably the power-law (Pareto) tail exponent -- TODO confirm
    mu: float  # presumably the lognormal location parameter -- TODO confirm
    sigma: float  # presumably the lognormal scale parameter -- TODO confirm
    label: str  # presumably a display name for this parameter set -- TODO confirm

class DistributionModels:
    """Rank-based models that return each rank's share of the total, in percent.

    Every model turns the given ranks into weights and rescales the weights
    so that they sum to 100 over the ranks that were passed in.
    """

    @staticmethod
    def _to_percentages(values: np.ndarray) -> np.ndarray:
        """Rescale *values* so that they sum to 100.

        Args:
            values: Unnormalised weights, one per rank.

        Returns:
            Array of the same shape whose entries sum to 100. An empty input
            is returned unchanged.

        Raises:
            ValueError: If the sum of the weights is zero, infinite or NaN.
                Dividing by such a sum would silently produce NaN/inf shares.
        """
        if values.size == 0:
            # Nothing to normalise; an empty array in gives an empty array out.
            return values
        total = np.sum(values)
        if not np.isfinite(total) or total == 0:
            raise ValueError(
                "cannot normalise weights: their sum is zero or not finite"
            )
        return values * 100 / total  # Convert to percentage

    @staticmethod
    def power_law(x: np.ndarray, alpha: float) -> np.ndarray:
        """Power law distribution for value percentages.

        Args:
            x: Ranks; any array-like of numbers is accepted.
            alpha: Exponent; the weight of rank ``r`` is ``r ** (-alpha)``.

        Returns:
            Percentage share of each rank (sums to 100).

        Raises:
            ValueError: If the weights cannot be normalised, e.g. a rank of 0
                with a positive ``alpha`` yields an infinite weight.
        """
        # asarray (instead of x.astype) also accepts lists and tuples.
        ranks = np.asarray(x, dtype=float)
        values = ranks ** (-alpha)
        return DistributionModels._to_percentages(values)

    @staticmethod
    def stretched_exponential(x: np.ndarray, beta: float, x0: float = 1) -> np.ndarray:
        """Stretched exponential distribution for value percentages.

        Args:
            x: Ranks; any array-like of numbers is accepted.
            beta: Stretching exponent applied to ``x / x0``.
            x0: Scale that ranks are divided by before exponentiation.

        Returns:
            Percentage share of each rank (sums to 100); the weight of rank
            ``r`` is ``exp(-(r / x0) ** beta)``.

        Raises:
            ValueError: If ``x0`` is zero, or if the weights cannot be
                normalised (their sum is zero or not finite).
        """
        if x0 == 0:
            raise ValueError("x0 must be non-zero")
        # Cast like power_law does so both models accept the same inputs.
        ranks = np.asarray(x, dtype=float)
        values = np.exp(-(ranks / x0) ** beta)
        return DistributionModels._to_percentages(values)

class DistributionAnalyzer:
    """Estimate which rank a target value share corresponds to under each model.

    For every model/parameter combination the analyzer computes the percentage
    share of ranks ``1..max_rank`` and picks the rank whose share is closest
    to ``target_percentage``.
    """

    def __init__(self, target_percentage: float, N: int = 10000, max_rank: int = 200):
        """Store the target share and build the rank axis ``1..max_rank``.

        Args:
            target_percentage: Value share (in percent) to locate.
            N: Stored on the instance as ``self.N``.
            max_rank: Highest rank to evaluate; must be at least 1.

        Raises:
            ValueError: If ``max_rank`` is smaller than 1, because an empty
                rank axis has no closest rank.
        """
        if max_rank < 1:
            raise ValueError("max_rank must be at least 1")
        self.target_percentage = target_percentage
        # NOTE(review): N is never read by any method of this class; shares are
        # normalised over ranks 1..max_rank only -- confirm whether the
        # normalisation was meant to run over N items instead.
        self.N = N
        self.max_rank = max_rank
        self.ranks = np.arange(1, max_rank + 1)

        self.models = DistributionModels()

    def calculate_distribution(self,
                               distribution_func: Callable[..., np.ndarray],
                               params: dict,
                               ranks: Optional[np.ndarray] = None) -> np.ndarray:
        """Calculate value percentage distribution.

        Args:
            distribution_func: Called as ``distribution_func(ranks, **params)``.
            params: Keyword arguments forwarded to ``distribution_func``.
            ranks: Ranks to evaluate; defaults to ``self.ranks``.

        Returns:
            Whatever ``distribution_func`` returns for the given ranks.
        """
        if ranks is None:
            ranks = self.ranks
        return distribution_func(ranks, **params)

    @staticmethod
    def _closest_rank(percentages: np.ndarray,
                      target_percentage: float) -> Tuple[int, float, float]:
        """Return (1-based rank, share, absolute error) of the entry nearest the target."""
        differences = np.abs(percentages - target_percentage)
        best_rank_idx = int(np.argmin(differences))
        # Convert numpy scalars to builtin types so the values match the
        # annotated return type and can be serialised (e.g. with json).
        return (
            best_rank_idx + 1,
            float(percentages[best_rank_idx]),
            float(differences[best_rank_idx]),
        )

    def find_closest_rank(self,
                          target_percentage: float,
                          distribution_func: Callable[..., np.ndarray],
                          params: dict) -> Tuple[int, float, float]:
        """
        Find the rank closest to target percentage

        Args:
            target_percentage: Value share (in percent) to look for.
            distribution_func: Model evaluated over ``self.ranks``.
            params: Keyword arguments forwarded to ``distribution_func``.

        Returns:
            Tuple[int, float, float]: (rank, actual_percentage, error), where
            rank is 1-based and error is the absolute difference between the
            rank's share and the target.
        """
        percentages = self.calculate_distribution(distribution_func, params)
        return self._closest_rank(percentages, target_percentage)

    def _analyze_and_plot(self,
                          ax: plt.Axes,
                          distribution_func: Callable[..., np.ndarray],
                          param_name: str,
                          symbol: str,
                          values: List[float],
                          title: str) -> List[Dict]:
        """Analyze one model for every parameter value and draw it on *ax*."""
        entries = []
        for value in values:
            # Compute the distribution once and reuse it for both the
            # closest-rank search and the scatter plot.
            percentages = self.calculate_distribution(
                distribution_func, {param_name: value}
            )
            rank, actual, error = self._closest_rank(
                percentages, self.target_percentage
            )
            entries.append({
                param_name: value,
                'rank': rank,
                'actual_percentage': actual,
                'error': error
            })
            ax.scatter(self.ranks, percentages, alpha=0.6, label=f'{symbol}={value}')

        self._format_axis(ax, title)
        return entries

    def create_visualization(self,
                             alphas: List[float],
                             betas: List[float],
                             figsize: Tuple[int, int] = (15, 6)) -> Tuple[plt.Figure, Dict]:
        """
        Create visualization and return analysis results

        Args:
            alphas: Power-law exponents to analyze (left subplot).
            betas: Stretched-exponential exponents to analyze (right subplot).
            figsize: Figure size passed to ``plt.subplots``.

        Returns:
            Tuple[plt.Figure, Dict]: (figure, results_dictionary). The
            dictionary has the keys ``'power_law'`` and ``'stretched_exp'``,
            each a list with one entry per parameter value holding the
            parameter, ``'rank'``, ``'actual_percentage'`` and ``'error'``.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
        results = {
            'power_law': self._analyze_and_plot(
                ax1, self.models.power_law, 'alpha', 'α', alphas, 'Power Law'
            ),
            'stretched_exp': self._analyze_and_plot(
                ax2, self.models.stretched_exponential, 'beta', 'β', betas,
                'Stretched Exponential'
            ),
        }

        # Lay out this figure explicitly rather than whichever figure pyplot
        # currently treats as active.
        fig.tight_layout()
        return fig, results

    def _format_axis(self, ax: plt.Axes, title: str) -> None:
        """Format plot axis with common settings"""
        # Horizontal reference line at the target share.
        ax.axhline(y=self.target_percentage, color='r',
                   linestyle='--', label=f'target ({self.target_percentage:.2f}%)')
        ax.set_title(title)
        ax.set_xlabel('Rank')
        ax.set_ylabel('Value Percentage')
        ax.set_yscale('log')
        ax.set_xlim(0, self.max_rank)
        ax.grid(True)
        ax.legend()

# Example usage
if __name__ == "__main__":
    def _summarize(entry):
        """Return the rank/actual/error part shared by every report line."""
        return (f"Rank={entry['rank']}, "
                f"Actual={entry['actual_percentage']:.2f}%, "
                f"Error={entry['error']:.2f}%")

    # Exponents to sweep for each model
    alphas = [1.0, 1.2, 1.4]
    betas = [0.4, 0.5, 0.6]

    # Locate a 0.67% share under every model/parameter combination
    analyzer = DistributionAnalyzer(0.67)
    fig, results = analyzer.create_visualization(alphas, betas)

    # Report the closest rank found for each parameter value
    print("\nPower Law Results:")
    for entry in results['power_law']:
        print(f"α={entry['alpha']:.1f}: " + _summarize(entry))

    print("\nStretched Exponential Results:")
    for entry in results['stretched_exp']:
        print(f"β={entry['beta']:.1f}: " + _summarize(entry))

    plt.show()