Transform any distribution into an approximately Gaussian (bell-shaped) distribution (正規分布)

  1. Rank-transform the data
  2. Rescale the ranks to the open interval (-1, 1), excluding the endpoints
  3. Take the inverse hyperbolic tangent, arctanh (双曲線正接).

$y = 2^x$

$\quad x \sim N(\mu=0,\sigma^2=1)$

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import rankdata
In [2]:
def rank_arctanh_transform(data):
    """Rank-transform 1-D data and map it through arctanh.

    The values are replaced by their ranks, the ranks are rescaled into
    the open interval (-1, 1), and the inverse hyperbolic tangent is
    applied. The output keeps the ordering of the input and has a
    symmetric, bell-shaped distribution.

    Parameters
    ----------
    data : array_like
        One-dimensional sequence of numeric values.

    Returns
    -------
    numpy.ndarray
        Transformed values, one per input element. An empty input yields
        an empty float array.
    """
    values = np.asarray(data)
    if values.size == 0:
        # Nothing to rank; also avoids calling max() on an empty array.
        return np.empty(0, dtype=float)

    ranks = rankdata(values)

    # Dividing by (largest rank + 1) keeps every scaled value strictly
    # below 1, and the smallest rank (>= 1) keeps it strictly above -1,
    # so arctanh never returns +/- infinity.
    scaled = 2 * (ranks / (ranks.max() + 1)) - 1

    return np.arctanh(scaled)


if __name__ == "__main__":
    n = 500

    # Skewed example data: 2**x with x drawn from a standard normal.
    orig_data = 2 ** np.random.randn(n)
    trans_data = rank_arctanh_transform(orig_data)

    # Left panel: original data; right panel: transformed data.
    fig, ax = plt.subplots(1, 2)

    ax[0].hist(orig_data, 30)
    ax[0].set_xlabel('Value')
    ax[0].set_ylabel('Count')

    ax[1].hist(trans_data, 30)
    ax[1].set_xlabel('Value')
    ax[1].set_ylabel('Count')

    plt.show()