| 1 | from math import * |
|---|
| 2 | |
|---|
| 3 | # function [alpha, xmin, L]=plfit(x, varargin) |
|---|
| 4 | # PLFIT fits a power-law distributional model to data. |
|---|
| 5 | # Source: http://www.santafe.edu/~aaronc/powerlaws/ |
|---|
| 6 | # |
|---|
| 7 | # PLFIT(x) estimates x_min and alpha according to the goodness-of-fit |
|---|
| 8 | # based method described in Clauset, Shalizi, Newman (2007). x is a |
|---|
| 9 | # vector of observations of some quantity to which we wish to fit the |
|---|
| 10 | # power-law distribution p(x) ~ x^-alpha for x >= xmin. |
|---|
| 11 | # PLFIT automatically detects whether x is composed of real or integer |
|---|
| 12 | # values, and applies the appropriate method. For discrete data, if |
|---|
| 13 | # min(x) > 1000, PLFIT uses the continuous approximation, which is |
|---|
| 14 | # a reliable in this regime. |
|---|
| 15 | # |
|---|
| 16 | # The fitting procedure works as follows: |
|---|
| 17 | # 1) For each possible choice of x_min, we estimate alpha via the |
|---|
| 18 | # method of maximum likelihood, and calculate the Kolmogorov-Smirnov |
|---|
| 19 | # goodness-of-fit statistic D. |
|---|
| 20 | # 2) We then select as our estimate of x_min, the value that gives the |
|---|
| 21 | # minimum value D over all values of x_min. |
|---|
| 22 | # |
|---|
| 23 | # Note that this procedure gives no estimate of the uncertainty of the |
|---|
| 24 | # fitted parameters, nor of the validity of the fit. |
|---|
| 25 | # |
|---|
| 26 | # Example: |
|---|
| 27 | # x = [500,150,90,81,75,75,70,65,60,58,49,47,40] |
|---|
| 28 | # [alpha, xmin, L] = plfit(x) |
|---|
| 29 | # or a = plfit(x) |
|---|
| 30 | # |
|---|
| 31 | # The output 'alpha' is the maximum likelihood estimate of the scaling |
|---|
| 32 | # exponent, 'xmin' is the estimate of the lower bound of the power-law |
|---|
| 33 | # behavior, and L is the log-likelihood of the data x>=xmin under the |
|---|
| 34 | # fitted power law. |
|---|
| 35 | # |
|---|
| 36 | # For more information, try 'type plfit' |
|---|
| 37 | # |
|---|
| 38 | # See also PLVAR, PLPVA |
|---|
| 39 | |
|---|
| 40 | # Version 1.0.10 (2010 January) |
|---|
| 41 | # Copyright (C) 2008-2011 Aaron Clauset (Santa Fe Institute) |
|---|
| 42 | |
|---|
| 43 | # Ported to Python by Joel Ornstein (2011 July) |
|---|
| 44 | # (joel_ornstein@hmc.edu) |
|---|
| 45 | |
|---|
| 46 | # Distributed under GPL 2.0 |
|---|
| 47 | # http://www.gnu.org/copyleft/gpl.html |
|---|
| 48 | # PLFIT comes with ABSOLUTELY NO WARRANTY |
|---|
| 49 | # |
|---|
| 50 | # |
|---|
| 51 | # The 'zeta' helper function is modified from the open-source library 'mpmath' |
|---|
| 52 | # mpmath: a Python library for arbitrary-precision floating-point arithmetic |
|---|
| 53 | # http://code.google.com/p/mpmath/ |
|---|
| 54 | # version 0.17 (February 2011) by Fredrik Johansson and others |
|---|
| 55 | # |
|---|
| 56 | |
|---|
| 57 | # Notes: |
|---|
| 58 | # |
|---|
| 59 | # 1. In order to implement the integer-based methods in Matlab, the numeric |
|---|
| 60 | # maximization of the log-likelihood function was used. This requires |
|---|
| 61 | # that we specify the range of scaling parameters considered. We set |
|---|
| 62 | # this range to be 1.50 to 3.50 at 0.01 intervals by default. |
|---|
| 63 | # This range can be set by the user like so, |
|---|
| 64 | # |
|---|
| 65 | # a = plfit(x,'range',[1.50,3.50,0.01]) |
|---|
| 66 | # |
|---|
| 67 | # 2. PLFIT can be told to limit the range of values considered as estimates |
|---|
| 68 | # for xmin in three ways. First, it can be instructed to sample these |
|---|
| 69 | # possible values like so, |
|---|
| 70 | # |
|---|
| 71 | # a = plfit(x,'sample',100) |
|---|
| 72 | # |
|---|
| 73 | # which uses 100 uniformly distributed values on the sorted list of |
|---|
| 74 | # unique values in the data set. Second, it can simply omit all |
|---|
| 75 | # candidates above a hard limit, like so |
|---|
| 76 | # |
|---|
| 77 | # a = plfit(x,'limit',3.4) |
|---|
| 78 | # |
|---|
| 79 | # Finally, it can be forced to use a fixed value, like so |
|---|
| 80 | # |
|---|
| 81 | # a = plfit(x,'xmin',3.4) |
|---|
| 82 | # |
|---|
| 83 | # In the case of discrete data, it rounds the limit to the nearest |
|---|
| 84 | # integer. |
|---|
| 85 | # |
|---|
| 86 | # 3. When the input sample size is small (e.g., < 100), the continuous |
|---|
| 87 | # estimator is slightly biased (toward larger values of alpha). To |
|---|
| 88 | # explicitly use an experimental finite-size correction, call PLFIT like |
|---|
| 89 | # so |
|---|
| 90 | # |
|---|
| 91 | # a = plfit(x,'finite') |
|---|
| 92 | # |
|---|
| 93 | # which does a small-size correction to alpha. |
|---|
| 94 | # |
|---|
| 95 | # 4. For continuous data, PLFIT can return erroneously large estimates of |
|---|
| 96 | # alpha when xmin is so large that the number of obs x >= xmin is very |
|---|
| 97 | # small. To prevent this, we can truncate the search over xmin values |
|---|
| 98 | # before the finite-size bias becomes significant by calling PLFIT as |
|---|
| 99 | # |
|---|
| 100 | # a = plfit(x,'nosmall') |
|---|
| 101 | # |
|---|
| 102 | # which skips values xmin with finite size bias > 0.1. |
|---|
| 103 | |
|---|
| 104 | def plfit(x, *varargin): |
|---|
| 105 | vec = [] |
|---|
| 106 | sample = [] |
|---|
| 107 | xminx = [] |
|---|
| 108 | limit = [] |
|---|
| 109 | finite = False |
|---|
| 110 | nosmall = False |
|---|
| 111 | nowarn = False |
|---|
| 112 | |
|---|
| 113 | # parse command-line parameters trap for bad input |
|---|
| 114 | i=0 |
|---|
| 115 | while i<len(varargin): |
|---|
| 116 | argok = 1 |
|---|
| 117 | if type(varargin[i])==str: |
|---|
| 118 | if varargin[i]=='range': |
|---|
| 119 | Range = varargin[i+1] |
|---|
| 120 | if Range[1]>Range[0]: |
|---|
| 121 | argok=0 |
|---|
| 122 | vec=[] |
|---|
| 123 | try: |
|---|
| 124 | vec=map(lambda X:X*float(Range[2])+Range[0],\ |
|---|
| 125 | range(int((Range[1]-Range[0])/Range[2]))) |
|---|
| 126 | |
|---|
| 127 | |
|---|
| 128 | except: |
|---|
| 129 | argok=0 |
|---|
| 130 | vec=[] |
|---|
| 131 | |
|---|
| 132 | |
|---|
| 133 | if Range[0]>=Range[1]: |
|---|
| 134 | argok=0 |
|---|
| 135 | vec=[] |
|---|
| 136 | i-=1 |
|---|
| 137 | |
|---|
| 138 | i+=1 |
|---|
| 139 | |
|---|
| 140 | |
|---|
| 141 | elif varargin[i]== 'sample': |
|---|
| 142 | sample = varargin[i+1] |
|---|
| 143 | i = i + 1 |
|---|
| 144 | elif varargin[i]== 'limit': |
|---|
| 145 | limit = varargin[i+1] |
|---|
| 146 | i = i + 1 |
|---|
| 147 | elif varargin[i]== 'xmin': |
|---|
| 148 | xminx = varargin[i+1] |
|---|
| 149 | i = i + 1 |
|---|
| 150 | elif varargin[i]== 'finite': finite = True |
|---|
| 151 | elif varargin[i]== 'nowarn': nowarn = True |
|---|
| 152 | elif varargin[i]== 'nosmall': nosmall = True |
|---|
| 153 | else: argok=0 |
|---|
| 154 | |
|---|
| 155 | |
|---|
| 156 | if not argok: |
|---|
| 157 | print '(PLFIT) Ignoring invalid argument #',i+1 |
|---|
| 158 | |
|---|
| 159 | i = i+1 |
|---|
| 160 | |
|---|
| 161 | if vec!=[] and (type(vec)!=list or min(vec)<=1): |
|---|
| 162 | print '(PLFIT) Error: ''range'' argument must contain a vector or minimum <= 1. using default.\n' |
|---|
| 163 | |
|---|
| 164 | vec = [] |
|---|
| 165 | |
|---|
| 166 | if sample!=[] and sample<2: |
|---|
| 167 | print'(PLFIT) Error: ''sample'' argument must be a positive integer > 1. using default.\n' |
|---|
| 168 | sample = [] |
|---|
| 169 | |
|---|
| 170 | if limit!=[] and limit<min(x): |
|---|
| 171 | print'(PLFIT) Error: ''limit'' argument must be a positive value >= 1. using default.\n' |
|---|
| 172 | limit = [] |
|---|
| 173 | |
|---|
| 174 | if xminx!=[] and xminx>=max(x): |
|---|
| 175 | print'(PLFIT) Error: ''xmin'' argument must be a positive value < max(x). using default behavior.\n' |
|---|
| 176 | xminx = [] |
|---|
| 177 | |
|---|
| 178 | |
|---|
| 179 | |
|---|
| 180 | # select method (discrete or continuous) for fitting |
|---|
| 181 | if reduce(lambda X,Y:X==True and floor(Y)==float(Y),x,True): f_dattype = 'INTS' |
|---|
| 182 | elif reduce(lambda X,Y:X==True and (type(Y)==int or type(Y)==float or type(Y)==long),x,True): f_dattype = 'REAL' |
|---|
| 183 | #else: f_dattype = 'UNKN' |
|---|
| 184 | ##AS |
|---|
| 185 | else: f_dattype = 'REAL' |
|---|
| 186 | |
|---|
| 187 | if f_dattype=='INTS' and min(x) > 1000 and len(x)>100: |
|---|
| 188 | f_dattype = 'REAL' |
|---|
| 189 | |
|---|
| 190 | print f_dattype |
|---|
| 191 | |
|---|
| 192 | # estimate xmin and alpha, accordingly |
|---|
| 193 | |
|---|
| 194 | if f_dattype== 'REAL': |
|---|
| 195 | xmins = unique(x) |
|---|
| 196 | xmins.sort() |
|---|
| 197 | xmins = xmins[0:-1] |
|---|
| 198 | if xminx!=[]: |
|---|
| 199 | |
|---|
| 200 | xmins = [min(filter(lambda X: X>=xminx,xmins))] |
|---|
| 201 | |
|---|
| 202 | |
|---|
| 203 | if limit!=[]: |
|---|
| 204 | xmins=filter(lambda X: X<=limit,xmins) |
|---|
| 205 | if xmins==[]: xmins = [min(x)] |
|---|
| 206 | |
|---|
| 207 | if sample!=[]: |
|---|
| 208 | step = float(len(xmins))/(sample-1) |
|---|
| 209 | index_curr=0 |
|---|
| 210 | new_xmins=[] |
|---|
| 211 | for i in range (0,sample): |
|---|
| 212 | if round(index_curr)==len(xmins): index_curr-=1 |
|---|
| 213 | new_xmins.append(xmins[int(round(index_curr))]) |
|---|
| 214 | index_curr+=step |
|---|
| 215 | xmins = unique(new_xmins) |
|---|
| 216 | xmins.sort() |
|---|
| 217 | |
|---|
| 218 | |
|---|
| 219 | |
|---|
| 220 | dat = [] |
|---|
| 221 | z = sorted(x) |
|---|
| 222 | |
|---|
| 223 | for xm in range(0,len(xmins)): |
|---|
| 224 | xmin = xmins[xm] |
|---|
| 225 | z = filter(lambda X:X>=xmin,z) |
|---|
| 226 | |
|---|
| 227 | n = len(z) |
|---|
| 228 | # estimate alpha using direct MLE |
|---|
| 229 | |
|---|
| 230 | a = float(n) / sum(map(lambda X: log(float(X)/xmin),z)) |
|---|
| 231 | if nosmall: |
|---|
| 232 | if (a-1)/sqrt(n) > 0.1 and dat!=[]: |
|---|
| 233 | xm = len(xmins)+1 |
|---|
| 234 | break |
|---|
| 235 | |
|---|
| 236 | |
|---|
| 237 | # compute KS statistic |
|---|
| 238 | #cx = map(lambda X:float(X)/n,range(0,n)) |
|---|
| 239 | cf = map(lambda X:1-pow((float(xmin)/X),a),z) |
|---|
| 240 | dat.append( max( map(lambda X: abs(cf[X]-float(X)/n),range(0,n)))) |
|---|
| 241 | D = min(dat) |
|---|
| 242 | xmin = xmins[dat.index(D)] |
|---|
| 243 | z = filter(lambda X:X>=xmin,x) |
|---|
| 244 | z.sort() |
|---|
| 245 | n = len(z) |
|---|
| 246 | alpha = 1 + n / sum(map(lambda X: log(float(X)/xmin),z)) |
|---|
| 247 | if finite: alpha = alpha*float(n-1)/n+1./n # finite-size correction |
|---|
| 248 | if n < 50 and not finite and not nowarn: |
|---|
| 249 | print '(PLFIT) Warning: finite-size bias may be present.\n' |
|---|
| 250 | |
|---|
| 251 | L = n*log((alpha-1)/xmin) - alpha*sum(map(lambda X: log(float(X)/xmin),z)) |
|---|
| 252 | elif f_dattype== 'INTS': |
|---|
| 253 | |
|---|
| 254 | x=map(int,x) |
|---|
| 255 | if vec==[]: |
|---|
| 256 | for X in range(150,351): |
|---|
| 257 | vec.append(X/100.) # covers range of most practical |
|---|
| 258 | # scaling parameters |
|---|
| 259 | zvec = map(zeta, vec) |
|---|
| 260 | |
|---|
| 261 | xmins = unique(x) |
|---|
| 262 | xmins.sort() |
|---|
| 263 | xmins = xmins[0:-1] |
|---|
| 264 | if xminx!=[]: |
|---|
| 265 | xmins = [min(filter(lambda X: X>=xminx,xmins))] |
|---|
| 266 | |
|---|
| 267 | if limit!=[]: |
|---|
| 268 | limit = round(limit) |
|---|
| 269 | xmins=filter(lambda X: X<=limit,xmins) |
|---|
| 270 | if xmins==[]: xmins = [min(x)] |
|---|
| 271 | |
|---|
| 272 | if sample!=[]: |
|---|
| 273 | step = float(len(xmins))/(sample-1) |
|---|
| 274 | index_curr=0 |
|---|
| 275 | new_xmins=[] |
|---|
| 276 | for i in range (0,sample): |
|---|
| 277 | if round(index_curr)==len(xmins): index_curr-=1 |
|---|
| 278 | new_xmins.append(xmins[int(round(index_curr))]) |
|---|
| 279 | index_curr+=step |
|---|
| 280 | xmins = unique(new_xmins) |
|---|
| 281 | xmins.sort() |
|---|
| 282 | |
|---|
| 283 | if xmins==[]: |
|---|
| 284 | print '(PLFIT) Error: x must contain at least two unique values.\n' |
|---|
| 285 | alpha = 'Not a Number' |
|---|
| 286 | xmin = x[0] |
|---|
| 287 | D = 'Not a Number' |
|---|
| 288 | return [alpha,xmin,D] |
|---|
| 289 | |
|---|
| 290 | xmax = max(x) |
|---|
| 291 | |
|---|
| 292 | z = x |
|---|
| 293 | z.sort() |
|---|
| 294 | datA=[] |
|---|
| 295 | datB=[] |
|---|
| 296 | |
|---|
| 297 | for xm in range(0,len(xmins)): |
|---|
| 298 | xmin = xmins[xm] |
|---|
| 299 | z = filter(lambda X:X>=xmin,z) |
|---|
| 300 | n = len(z) |
|---|
| 301 | # estimate alpha via direct maximization of likelihood function |
|---|
| 302 | |
|---|
| 303 | # force iterative calculation |
|---|
| 304 | L = [] |
|---|
| 305 | slogz = sum(map(log,z)) |
|---|
| 306 | xminvec = map(float,range(1,xmin)) |
|---|
| 307 | for k in range(0,len(vec)): |
|---|
| 308 | L.append(-vec[k]*float(slogz) - float(n)*log(float(zvec[k]) - sum(map(lambda X:pow(float(X),-vec[k]),xminvec)))) |
|---|
| 309 | |
|---|
| 310 | |
|---|
| 311 | I = L.index(max(L)) |
|---|
| 312 | # compute KS statistic |
|---|
| 313 | fit = reduce(lambda X,Y: X+[Y+X[-1]],\ |
|---|
| 314 | (map(lambda X: pow(X,-vec[I])/(float(zvec[I])-sum(map(lambda X: pow(X,-vec[I]),map(float,range(1,xmin))))),range(xmin,xmax+1))),[0])[1:] |
|---|
| 315 | cdi=[] |
|---|
| 316 | for XM in range(xmin,xmax+1): |
|---|
| 317 | cdi.append(len(filter(lambda X: floor(X)<=XM,z))/float(n)) |
|---|
| 318 | |
|---|
| 319 | datA.append(max( map(lambda X: abs(fit[X] - cdi[X]),range(0,xmax-xmin+1)))) |
|---|
| 320 | datB.append(vec[I]) |
|---|
| 321 | # select the index for the minimum value of D |
|---|
| 322 | I = datA.index(min(datA)) |
|---|
| 323 | xmin = xmins[I] |
|---|
| 324 | z = filter(lambda X:X>=xmin,x) |
|---|
| 325 | n = len(z) |
|---|
| 326 | alpha = datB[I] |
|---|
| 327 | if finite: alpha = alpha*(n-1.)/n+1./n # finite-size correction |
|---|
| 328 | if n < 50 and not finite and not nowarn: |
|---|
| 329 | print '(PLFIT) Warning: finite-size bias may be present.\n' |
|---|
| 330 | |
|---|
| 331 | L = -alpha*sum(map(log,z)) - n*log(zvec[vec.index(max(filter(lambda X:X<=alpha,vec)))] - \ |
|---|
| 332 | sum(map(lambda X: pow(X,-alpha),range(1,xmin)))) |
|---|
| 333 | else: |
|---|
| 334 | print '(PLFIT) Error: x must contain only reals or only integers.\n' |
|---|
| 335 | alpha = [] |
|---|
| 336 | xmin = [] |
|---|
| 337 | L = [] |
|---|
| 338 | |
|---|
| 339 | return [alpha,xmin,L] |
|---|
| 340 | |
|---|
| 341 | |
|---|
| 342 | # helper functions (unique and zeta) |
|---|
| 343 | |
|---|
| 344 | |
|---|
| 345 | def unique(seq): |
|---|
| 346 | # not order preserving |
|---|
| 347 | set = {} |
|---|
| 348 | map(set.__setitem__, seq, []) |
|---|
| 349 | return set.keys() |
|---|
| 350 | |
|---|
| 351 | def _polyval(coeffs, x): |
|---|
| 352 | p = coeffs[0] |
|---|
| 353 | for c in coeffs[1:]: |
|---|
| 354 | p = c + x*p |
|---|
| 355 | return p |
|---|
| 356 | |
|---|
| 357 | _zeta_int = [\ |
|---|
| 358 | -0.5, |
|---|
| 359 | 0.0, |
|---|
| 360 | 1.6449340668482264365,1.2020569031595942854,1.0823232337111381915, |
|---|
| 361 | 1.0369277551433699263,1.0173430619844491397,1.0083492773819228268, |
|---|
| 362 | 1.0040773561979443394,1.0020083928260822144,1.0009945751278180853, |
|---|
| 363 | 1.0004941886041194646,1.0002460865533080483,1.0001227133475784891, |
|---|
| 364 | 1.0000612481350587048,1.0000305882363070205,1.0000152822594086519, |
|---|
| 365 | 1.0000076371976378998,1.0000038172932649998,1.0000019082127165539, |
|---|
| 366 | 1.0000009539620338728,1.0000004769329867878,1.0000002384505027277, |
|---|
| 367 | 1.0000001192199259653,1.0000000596081890513,1.0000000298035035147, |
|---|
| 368 | 1.0000000149015548284] |
|---|
| 369 | |
|---|
| 370 | _zeta_P = [-3.50000000087575873, -0.701274355654678147, |
|---|
| 371 | -0.0672313458590012612, -0.00398731457954257841, |
|---|
| 372 | -0.000160948723019303141, -4.67633010038383371e-6, |
|---|
| 373 | -1.02078104417700585e-7, -1.68030037095896287e-9, |
|---|
| 374 | -1.85231868742346722e-11][::-1] |
|---|
| 375 | |
|---|
| 376 | _zeta_Q = [1.00000000000000000, -0.936552848762465319, |
|---|
| 377 | -0.0588835413263763741, -0.00441498861482948666, |
|---|
| 378 | -0.000143416758067432622, -5.10691659585090782e-6, |
|---|
| 379 | -9.58813053268913799e-8, -1.72963791443181972e-9, |
|---|
| 380 | -1.83527919681474132e-11][::-1] |
|---|
| 381 | |
|---|
| 382 | _zeta_1 = [3.03768838606128127e-10, -1.21924525236601262e-8, |
|---|
| 383 | 2.01201845887608893e-7, -1.53917240683468381e-6, |
|---|
| 384 | -5.09890411005967954e-7, 0.000122464707271619326, |
|---|
| 385 | -0.000905721539353130232, -0.00239315326074843037, |
|---|
| 386 | 0.084239750013159168, 0.418938517907442414, 0.500000001921884009] |
|---|
| 387 | |
|---|
| 388 | _zeta_0 = [-3.46092485016748794e-10, -6.42610089468292485e-9, |
|---|
| 389 | 1.76409071536679773e-7, -1.47141263991560698e-6, -6.38880222546167613e-7, |
|---|
| 390 | 0.000122641099800668209, -0.000905894913516772796, -0.00239303348507992713, |
|---|
| 391 | 0.0842396947501199816, 0.418938533204660256, 0.500000000000000052] |
|---|
| 392 | |
|---|
| 393 | def zeta(s): |
|---|
| 394 | """ |
|---|
| 395 | Riemann zeta function, real argument |
|---|
| 396 | """ |
|---|
| 397 | if not isinstance(s, (float, int)): |
|---|
| 398 | try: |
|---|
| 399 | s = float(s) |
|---|
| 400 | except (ValueError, TypeError): |
|---|
| 401 | try: |
|---|
| 402 | s = complex(s) |
|---|
| 403 | if not s.imag: |
|---|
| 404 | return complex(zeta(s.real)) |
|---|
| 405 | except (ValueError, TypeError): |
|---|
| 406 | pass |
|---|
| 407 | raise NotImplementedError |
|---|
| 408 | if s == 1: |
|---|
| 409 | raise ValueError("zeta(1) pole") |
|---|
| 410 | if s >= 27: |
|---|
| 411 | return 1.0 + 2.0**(-s) + 3.0**(-s) |
|---|
| 412 | n = int(s) |
|---|
| 413 | if n == s: |
|---|
| 414 | if n >= 0: |
|---|
| 415 | return _zeta_int[n] |
|---|
| 416 | if not (n % 2): |
|---|
| 417 | return 0.0 |
|---|
| 418 | if s <= 0.0: |
|---|
| 419 | return 0 |
|---|
| 420 | if s <= 2.0: |
|---|
| 421 | if s <= 1.0: |
|---|
| 422 | return _polyval(_zeta_0,s)/(s-1) |
|---|
| 423 | return _polyval(_zeta_1,s)/(s-1) |
|---|
| 424 | z = _polyval(_zeta_P,s) / _polyval(_zeta_Q,s) |
|---|
| 425 | return 1.0 + 2.0**(-s) + 3.0**(-s) + 4.0**(-s)*z |
|---|
| 426 | |
|---|
| 427 | |
|---|
| 428 | |
|---|
| 429 | |
|---|