From bef2d4949ebc3a097bceccc503bc6cf711555623 Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Tue, 1 Jun 2021 21:39:01 +0200 Subject: [PATCH 1/7] Adding tests on different kind of sampleable distributions --- cara/tests/test_sampleable_distribution.py | 81 ++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 cara/tests/test_sampleable_distribution.py diff --git a/cara/tests/test_sampleable_distribution.py b/cara/tests/test_sampleable_distribution.py new file mode 100644 index 00000000..b53f9ad6 --- /dev/null +++ b/cara/tests/test_sampleable_distribution.py @@ -0,0 +1,81 @@ +import numpy as np +import numpy.testing as npt +import pytest + +from cara.monte_carlo import sampleable + +@pytest.mark.parametrize( + "mean, std",[ + [1., 0.5], + ] +) +def test_normal(mean, std): + # test that the sample has approximately the right mean, + # std deviation and distribution function. + sample_size = 2000000 + samples = sampleable.Normal(mean, std).generate_samples(sample_size) + histogram, bins = np.histogram(samples,bins=100, density=True) + x = (bins[1:]+bins[:-1])/2 + exact_dist = 1/(np.sqrt(2*np.pi)*std) * np.exp(-((x-mean)/std)**2/2) + + assert len(samples) == sample_size + npt.assert_allclose([samples.mean(), samples.std()], [mean, std], atol=mean/100) + npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + + +@pytest.mark.parametrize( + "mean_gaussian, std_gaussian",[ + [-0.6872121723362303, 0.10498338229297108], + ] +) +def test_lognormal(mean_gaussian, std_gaussian): + # test that the sample has approximately the right mean, + # std deviation and distribution function. + sample_size = 2000000 + samples = sampleable.LogNormal(mean_gaussian, std_gaussian + ).generate_samples(sample_size) + histogram, bins = np.histogram(samples,bins=50, density=True) + x = (bins[1:]+bins[:-1])/2 + exact_dist = ( 1/(x*np.sqrt(2*np.pi)*std_gaussian) * + np.exp(-((np.log(x)-mean_gaussian)/std_gaussian)**2/2) ) + exact_mean = np.exp(mean_gaussian + std_gaussian**2/2) + exact_std = np.sqrt( (np.exp(std_gaussian**2)-1) * + np.exp(2*mean_gaussian + std_gaussian**2) ) + + assert len(samples) == sample_size + npt.assert_allclose([samples.mean(), samples.std()], + [exact_mean, exact_std], atol=exact_mean/100) + npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + + +@pytest.mark.parametrize( + "use_kernel", + [False, True], +) +def test_custom(use_kernel): + # test that the sample has approximately the right distribution + # function, with both Custom and CustomKernel method. The latter + # is less accurate for smooth functions. + # the distribution function is an inverted parabola, with maximum 0.15, + # which is 0 at the bounds (0,10) (normalized) + norm = 500/3. 
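+    # (500/3 is the integral of -(5 - x)**2 + 25 over (0, 10), so that the
+    # function below integrates to 1 and peaks at 25/norm = 0.15)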
+ function = lambda x: (-(5 - x)**2 + 25)/norm + max_function = 0.15 + sample_size = 2000000 + + if use_kernel: + variable = np.linspace(0.1,9.9,100) + frequencies = function(variable) + samples = sampleable.CustomKernel(variable, frequencies, + kernel_bandwidth=0.1 + ).generate_samples(sample_size) + tolerance = max_function/10 + else: + samples = sampleable.Custom((0, 10), function, max_function + ).generate_samples(sample_size) + tolerance = max_function/50 + + histogram, bins = np.histogram(samples, bins=100, density=True) + correct_dist = function((bins[1:]+bins[:-1])/2) + assert len(samples) == sample_size + npt.assert_allclose(histogram, correct_dist, atol=tolerance) From 0cc1e85ececa08ce1a69c2ac28a7f63a81a2381b Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Tue, 1 Jun 2021 21:40:07 +0200 Subject: [PATCH 2/7] Adding new sampleable distributions: lognormal and custom (with two different methods) - thanks to Markus Rognlien --- cara/monte_carlo/sampleable.py | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/cara/monte_carlo/sampleable.py b/cara/monte_carlo/sampleable.py index 4ed49d82..df7a7646 100644 --- a/cara/monte_carlo/sampleable.py +++ b/cara/monte_carlo/sampleable.py @@ -1,6 +1,7 @@ import typing import numpy as np +from sklearn.neighbors import KernelDensity # type: ignore import cara.models @@ -16,6 +17,9 @@ class SampleableDistribution: class Normal(SampleableDistribution): + """ + Defines a normal (i.e. Gaussian) distribution + """ def __init__(self, mean: float, standard_deviation: float): self.mean = mean self.standard_deviation = standard_deviation @@ -24,6 +28,72 @@ class Normal(SampleableDistribution): return np.random.normal(self.mean, self.standard_deviation, size=size) +class LogNormal(SampleableDistribution): + """ + Defines a lognormal distribution (i.e. Gaussian distribution vs. the + natural logarithm of the random variable) + """ + + def __init__(self, mean_gaussian: float, standard_deviation_gaussian: float): + # these are resp. the mean and std. deviation of the underlying + # Gaussian distribution + self.mean_gaussian = mean_gaussian + self.standard_deviation_gaussian = standard_deviation_gaussian + + def generate_samples(self, size: int) -> float_array_size_n: + return np.random.lognormal(self.mean_gaussian, + self.standard_deviation_gaussian, + size=size) + + +class Custom(SampleableDistribution): + """ + Defines a distribution which follows a custom curve vs. the random + variable. Uses a simple algorithm. This is appropriate for a smooth + distribution function (one should know its maximum). + """ + def __init__(self, bounds: typing.Tuple[float, float], + function: typing.Callable, max_function: float): + self.bounds = bounds + self.function = function + self.max_function = max_function + + def generate_samples(self, size: int) -> float_array_size_n: + fvalue = np.random.uniform(0,self.max_function,size) + x = np.random.uniform(*self.bounds,size) + invalid = np.where(fvalue>self.function(x))[0] + while len(invalid)>0: + fvalue[invalid] = np.random.uniform(0,self.max_function,len(invalid)) + x[invalid] = np.random.uniform(*self.bounds,len(invalid)) + invalid = np.where(fvalue>self.function(x))[0] + + return x + + +class CustomKernel(SampleableDistribution): + """ + Defines a distribution which follows a custom curve vs. the + random variable. Uses a Gaussian kernel density fit. This is more + appropriate for a noisy distribution function. 
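+    Samples are drawn directly from the fitted kernel density, so values
+    slightly outside the tabulated range of the variable can occur.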
+ """ + def __init__(self, variable: float_array_size_n, + frequencies: float_array_size_n, + kernel_bandwidth: float): + # these are resp. the random variable, the distribution + # frequencies at these values, and the bandwidth of the Gaussian + # kernel + self.variable = variable + self.frequencies = frequencies + self.kernel_bandwidth = kernel_bandwidth + + def generate_samples(self, size: int) -> float_array_size_n: + kde_model = KernelDensity(kernel='gaussian', + bandwidth=self.kernel_bandwidth) + kde_model.fit(self.variable.reshape(-1, 1), + sample_weight=self.frequencies) + return kde_model.sample(n_samples=size)[:, 0] + + _VectorisedFloatOrSampleable = typing.Union[ SampleableDistribution, cara.models._VectorisedFloat, ] From 867536295019f0cf4ba9aa9dee40b2f549966faa Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Tue, 1 Jun 2021 21:48:04 +0200 Subject: [PATCH 3/7] Minor fix in docstring --- cara/monte_carlo/sampleable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cara/monte_carlo/sampleable.py b/cara/monte_carlo/sampleable.py index df7a7646..95374370 100644 --- a/cara/monte_carlo/sampleable.py +++ b/cara/monte_carlo/sampleable.py @@ -79,7 +79,7 @@ class CustomKernel(SampleableDistribution): def __init__(self, variable: float_array_size_n, frequencies: float_array_size_n, kernel_bandwidth: float): - # these are resp. the random variable, the distribution + # these are resp. the random variable, the distribution # frequencies at these values, and the bandwidth of the Gaussian # kernel self.variable = variable From 185d57639744e8f4bcb68635b8f8b6d2a370568e Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 07:16:00 +0200 Subject: [PATCH 4/7] Adding sklearn to the dependencies --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index e99d3185..3d5fc17b 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ REQUIREMENTS: dict = { 'numpy', 'qrcode[pil]', 'scipy', + 'sklearn', 'tornado', 'voila >=0.2.4', ], From b7f4ad8c3e1af27ce7d30d2b55869e5e96301ddb Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 09:06:38 +0200 Subject: [PATCH 5/7] Adding a TODO in sampleable tests --- cara/tests/test_sampleable_distribution.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cara/tests/test_sampleable_distribution.py b/cara/tests/test_sampleable_distribution.py index b53f9ad6..b3887763 100644 --- a/cara/tests/test_sampleable_distribution.py +++ b/cara/tests/test_sampleable_distribution.py @@ -3,6 +3,8 @@ import numpy.testing as npt import pytest from cara.monte_carlo import sampleable +# TODO: seed deterministically the random number generators, here (to +# avoid random issues with tests) @pytest.mark.parametrize( "mean, std",[ From 61b06fc1f0bd80f96472fba85a31b34d20f4d055 Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 12:43:10 +0200 Subject: [PATCH 6/7] Adding LogCustomKernel sampleable distribution (curve is vs. a log variable) --- cara/monte_carlo/sampleable.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cara/monte_carlo/sampleable.py b/cara/monte_carlo/sampleable.py index 95374370..4333c93f 100644 --- a/cara/monte_carlo/sampleable.py +++ b/cara/monte_carlo/sampleable.py @@ -94,6 +94,30 @@ class CustomKernel(SampleableDistribution): return kde_model.sample(n_samples=size)[:, 0] +class LogCustomKernel(SampleableDistribution): + """ + Defines a distribution which follows a custom curve vs. the log + (in base 10) of the random variable. 
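+    Generated samples are returned in linear scale (i.e. as 10**x).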
Uses a Gaussian kernel density + fit. This is more appropriate for a noisy distribution function. + """ + def __init__(self, log_variable: float_array_size_n, + frequencies: float_array_size_n, + kernel_bandwidth: float): + # these are resp. the log of the random variable, the distribution + # frequencies at these values, and the bandwidth of the Gaussian + # kernel + self.log_variable = log_variable + self.frequencies = frequencies + self.kernel_bandwidth = kernel_bandwidth + + def generate_samples(self, size: int) -> float_array_size_n: + kde_model = KernelDensity(kernel='gaussian', + bandwidth=self.kernel_bandwidth) + kde_model.fit(self.log_variable.reshape(-1, 1), + sample_weight=self.frequencies) + return 10 ** kde_model.sample(n_samples=size)[:, 0] + + _VectorisedFloatOrSampleable = typing.Union[ SampleableDistribution, cara.models._VectorisedFloat, ] From 387fda67d844ce875f529d326b8665b562975629 Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 12:45:49 +0200 Subject: [PATCH 7/7] Improving tests on sampleable distributions (more robust; using seed); adding test for LogCustomKernel --- cara/tests/test_sampleable_distribution.py | 57 ++++++++++++++++------ 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/cara/tests/test_sampleable_distribution.py b/cara/tests/test_sampleable_distribution.py index b3887763..e3801c7e 100644 --- a/cara/tests/test_sampleable_distribution.py +++ b/cara/tests/test_sampleable_distribution.py @@ -3,8 +3,8 @@ import numpy.testing as npt import pytest from cara.monte_carlo import sampleable -# TODO: seed deterministically the random number generators, here (to -# avoid random issues with tests) +# TODO: seed better the random number generators +np.random.seed(2000) @pytest.mark.parametrize( "mean, std",[ @@ -17,12 +17,14 @@ def test_normal(mean, std): sample_size = 2000000 samples = sampleable.Normal(mean, std).generate_samples(sample_size) histogram, bins = np.histogram(samples,bins=100, density=True) - x = (bins[1:]+bins[:-1])/2 - exact_dist = 1/(np.sqrt(2*np.pi)*std) * np.exp(-((x-mean)/std)**2/2) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=0.25 and b<=1.75]) + exact_dist = 1/(np.sqrt(2*np.pi)*std) * np.exp( + -((np.array(selected_bins)-mean)/std)**2/2) assert len(samples) == sample_size - npt.assert_allclose([samples.mean(), samples.std()], [mean, std], atol=mean/100) - npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + npt.assert_allclose([samples.mean(), samples.std()], [mean, std], rtol=0.01) + npt.assert_allclose(selected_histogram, exact_dist, rtol=0.02) @pytest.mark.parametrize( @@ -37,17 +39,19 @@ def test_lognormal(mean_gaussian, std_gaussian): samples = sampleable.LogNormal(mean_gaussian, std_gaussian ).generate_samples(sample_size) histogram, bins = np.histogram(samples,bins=50, density=True) - x = (bins[1:]+bins[:-1])/2 - exact_dist = ( 1/(x*np.sqrt(2*np.pi)*std_gaussian) * - np.exp(-((np.log(x)-mean_gaussian)/std_gaussian)**2/2) ) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=0.4 and b<=0.6]) + selected_bins = np.array(selected_bins) + exact_dist = ( 1/(selected_bins*np.sqrt(2*np.pi)*std_gaussian) * + np.exp(-((np.log(selected_bins)-mean_gaussian)/std_gaussian)**2/2) ) exact_mean = np.exp(mean_gaussian + std_gaussian**2/2) exact_std = np.sqrt( (np.exp(std_gaussian**2)-1) * np.exp(2*mean_gaussian + std_gaussian**2) ) assert len(samples) == sample_size 
npt.assert_allclose([samples.mean(), samples.std()], - [exact_mean, exact_std], atol=exact_mean/100) - npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + [exact_mean, exact_std], rtol=0.01) + npt.assert_allclose(selected_histogram, exact_dist, rtol=0.02) @pytest.mark.parametrize( @@ -71,13 +75,36 @@ def test_custom(use_kernel): samples = sampleable.CustomKernel(variable, frequencies, kernel_bandwidth=0.1 ).generate_samples(sample_size) - tolerance = max_function/10 else: samples = sampleable.Custom((0, 10), function, max_function ).generate_samples(sample_size) - tolerance = max_function/50 histogram, bins = np.histogram(samples, bins=100, density=True) - correct_dist = function((bins[1:]+bins[:-1])/2) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=1 and b<=9]) + correct_dist = function(np.array(selected_bins)) assert len(samples) == sample_size - npt.assert_allclose(histogram, correct_dist, atol=tolerance) + npt.assert_allclose(selected_histogram, correct_dist, rtol=0.05) + + +def test_logcustomkernel(): + # test that the sample has approximately the right distribution + # function, for the LogCustomKernel. + # the distribution function is an inverted parabola vs. the log of + # the variable (normalized) + norm = 500/3. + function = lambda x: (-(5 - x)**2 + 25)/norm + sample_size = 2000000 + + log_variable = np.linspace(0.1,9.9,100) + frequencies = function(log_variable) + samples = sampleable.LogCustomKernel(log_variable, frequencies, + kernel_bandwidth=0.1 + ).generate_samples(sample_size) + + histogram, bins = np.histogram(np.log10(samples), bins=100, density=True) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=1 and b<=9]) + correct_dist = function(np.array(selected_bins)) + assert len(samples) == sample_size + npt.assert_allclose(selected_histogram, correct_dist, rtol=0.05)
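For reference, a minimal usage sketch of the sampleable distributions added in
this series (class names and the generate_samples signature are as defined in
cara/monte_carlo/sampleable.py above; the numerical parameters here are
illustrative only):

import numpy as np

from cara.monte_carlo import sampleable

size = 100000

# Gaussian with mean 1 and standard deviation 0.5
normal_samples = sampleable.Normal(1., 0.5).generate_samples(size)

# lognormal, parametrized by the mean and standard deviation of the
# underlying Gaussian
lognormal_samples = sampleable.LogNormal(-0.69, 0.10).generate_samples(size)

# smooth custom density on (0, 10) with known maximum 0.15 (rejection
# sampling is used internally)
parabola = lambda x: (-(5. - x)**2 + 25.)*3./500.
custom_samples = sampleable.Custom((0., 10.), parabola,
                                   0.15).generate_samples(size)

# tabulated (possibly noisy) frequencies: Gaussian kernel density fit
variable = np.linspace(0.1, 9.9, 100)
kernel_samples = sampleable.CustomKernel(variable, parabola(variable),
                                         kernel_bandwidth=0.1
                                         ).generate_samples(size)

# same, but the tabulated curve is given vs. log10 of the variable;
# samples come back in linear scale
log_samples = sampleable.LogCustomKernel(variable, parabola(variable),
                                         kernel_bandwidth=0.1
                                         ).generate_samples(size)

All five classes share the generate_samples(size) interface of
SampleableDistribution, so they should be usable interchangeably wherever a
_VectorisedFloatOrSampleable is accepted.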