From bef2d4949ebc3a097bceccc503bc6cf711555623 Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Tue, 1 Jun 2021 21:39:01 +0200 Subject: [PATCH 1/7] Adding tests on different kind of sampleable distributions --- cara/tests/test_sampleable_distribution.py | 81 ++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 cara/tests/test_sampleable_distribution.py diff --git a/cara/tests/test_sampleable_distribution.py b/cara/tests/test_sampleable_distribution.py new file mode 100644 index 00000000..b53f9ad6 --- /dev/null +++ b/cara/tests/test_sampleable_distribution.py @@ -0,0 +1,81 @@ +import numpy as np +import numpy.testing as npt +import pytest + +from cara.monte_carlo import sampleable + +@pytest.mark.parametrize( + "mean, std",[ + [1., 0.5], + ] +) +def test_normal(mean, std): + # test that the sample has approximately the right mean, + # std deviation and distribution function. + sample_size = 2000000 + samples = sampleable.Normal(mean, std).generate_samples(sample_size) + histogram, bins = np.histogram(samples,bins=100, density=True) + x = (bins[1:]+bins[:-1])/2 + exact_dist = 1/(np.sqrt(2*np.pi)*std) * np.exp(-((x-mean)/std)**2/2) + + assert len(samples) == sample_size + npt.assert_allclose([samples.mean(), samples.std()], [mean, std], atol=mean/100) + npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + + +@pytest.mark.parametrize( + "mean_gaussian, std_gaussian",[ + [-0.6872121723362303, 0.10498338229297108], + ] +) +def test_lognormal(mean_gaussian, std_gaussian): + # test that the sample has approximately the right mean, + # std deviation and distribution function. + sample_size = 2000000 + samples = sampleable.LogNormal(mean_gaussian, std_gaussian + ).generate_samples(sample_size) + histogram, bins = np.histogram(samples,bins=50, density=True) + x = (bins[1:]+bins[:-1])/2 + exact_dist = ( 1/(x*np.sqrt(2*np.pi)*std_gaussian) * + np.exp(-((np.log(x)-mean_gaussian)/std_gaussian)**2/2) ) + exact_mean = np.exp(mean_gaussian + std_gaussian**2/2) + exact_std = np.sqrt( (np.exp(std_gaussian**2)-1) * + np.exp(2*mean_gaussian + std_gaussian**2) ) + + assert len(samples) == sample_size + npt.assert_allclose([samples.mean(), samples.std()], + [exact_mean, exact_std], atol=exact_mean/100) + npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + + +@pytest.mark.parametrize( + "use_kernel", + [False, True], +) +def test_custom(use_kernel): + # test that the sample has approximately the right distribution + # function, with both Custom and CustomKernel method. The latter + # is less accurate for smooth functions. + # the distribution function is an inverted parabola, with maximum 0.15, + # which is 0 at the bounds (0,10) (normalized) + norm = 500/3. 
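+    # (500/3 is the integral of -(5 - x)**2 + 25 over (0, 10), so that the
+    # function below integrates to 1 and peaks at 25/norm = 0.15)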
+ function = lambda x: (-(5 - x)**2 + 25)/norm + max_function = 0.15 + sample_size = 2000000 + + if use_kernel: + variable = np.linspace(0.1,9.9,100) + frequencies = function(variable) + samples = sampleable.CustomKernel(variable, frequencies, + kernel_bandwidth=0.1 + ).generate_samples(sample_size) + tolerance = max_function/10 + else: + samples = sampleable.Custom((0, 10), function, max_function + ).generate_samples(sample_size) + tolerance = max_function/50 + + histogram, bins = np.histogram(samples, bins=100, density=True) + correct_dist = function((bins[1:]+bins[:-1])/2) + assert len(samples) == sample_size + npt.assert_allclose(histogram, correct_dist, atol=tolerance) From 0cc1e85ececa08ce1a69c2ac28a7f63a81a2381b Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Tue, 1 Jun 2021 21:40:07 +0200 Subject: [PATCH 2/7] Adding new sampleable distributions: lognormal and custom (with two different methods) - thanks to Markus Rognlien --- cara/monte_carlo/sampleable.py | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/cara/monte_carlo/sampleable.py b/cara/monte_carlo/sampleable.py index 4ed49d82..df7a7646 100644 --- a/cara/monte_carlo/sampleable.py +++ b/cara/monte_carlo/sampleable.py @@ -1,6 +1,7 @@ import typing import numpy as np +from sklearn.neighbors import KernelDensity # type: ignore import cara.models @@ -16,6 +17,9 @@ class SampleableDistribution: class Normal(SampleableDistribution): + """ + Defines a normal (i.e. Gaussian) distribution + """ def __init__(self, mean: float, standard_deviation: float): self.mean = mean self.standard_deviation = standard_deviation @@ -24,6 +28,72 @@ class Normal(SampleableDistribution): return np.random.normal(self.mean, self.standard_deviation, size=size) +class LogNormal(SampleableDistribution): + """ + Defines a lognormal distribution (i.e. Gaussian distribution vs. the + natural logarithm of the random variable) + """ + + def __init__(self, mean_gaussian: float, standard_deviation_gaussian: float): + # these are resp. the mean and std. deviation of the underlying + # Gaussian distribution + self.mean_gaussian = mean_gaussian + self.standard_deviation_gaussian = standard_deviation_gaussian + + def generate_samples(self, size: int) -> float_array_size_n: + return np.random.lognormal(self.mean_gaussian, + self.standard_deviation_gaussian, + size=size) + + +class Custom(SampleableDistribution): + """ + Defines a distribution which follows a custom curve vs. the random + variable. Uses a simple algorithm. This is appropriate for a smooth + distribution function (one should know its maximum). + """ + def __init__(self, bounds: typing.Tuple[float, float], + function: typing.Callable, max_function: float): + self.bounds = bounds + self.function = function + self.max_function = max_function + + def generate_samples(self, size: int) -> float_array_size_n: + fvalue = np.random.uniform(0,self.max_function,size) + x = np.random.uniform(*self.bounds,size) + invalid = np.where(fvalue>self.function(x))[0] + while len(invalid)>0: + fvalue[invalid] = np.random.uniform(0,self.max_function,len(invalid)) + x[invalid] = np.random.uniform(*self.bounds,len(invalid)) + invalid = np.where(fvalue>self.function(x))[0] + + return x + + +class CustomKernel(SampleableDistribution): + """ + Defines a distribution which follows a custom curve vs. the + random variable. Uses a Gaussian kernel density fit. This is more + appropriate for a noisy distribution function. 
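+    Samples are drawn directly from the fitted kernel density, so values
+    slightly outside the tabulated range of the variable can occur.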
+ """ + def __init__(self, variable: float_array_size_n, + frequencies: float_array_size_n, + kernel_bandwidth: float): + # these are resp. the random variable, the distribution + # frequencies at these values, and the bandwidth of the Gaussian + # kernel + self.variable = variable + self.frequencies = frequencies + self.kernel_bandwidth = kernel_bandwidth + + def generate_samples(self, size: int) -> float_array_size_n: + kde_model = KernelDensity(kernel='gaussian', + bandwidth=self.kernel_bandwidth) + kde_model.fit(self.variable.reshape(-1, 1), + sample_weight=self.frequencies) + return kde_model.sample(n_samples=size)[:, 0] + + _VectorisedFloatOrSampleable = typing.Union[ SampleableDistribution, cara.models._VectorisedFloat, ] From 867536295019f0cf4ba9aa9dee40b2f549966faa Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Tue, 1 Jun 2021 21:48:04 +0200 Subject: [PATCH 3/7] Minor fix in docstring --- cara/monte_carlo/sampleable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cara/monte_carlo/sampleable.py b/cara/monte_carlo/sampleable.py index df7a7646..95374370 100644 --- a/cara/monte_carlo/sampleable.py +++ b/cara/monte_carlo/sampleable.py @@ -79,7 +79,7 @@ class CustomKernel(SampleableDistribution): def __init__(self, variable: float_array_size_n, frequencies: float_array_size_n, kernel_bandwidth: float): - # these are resp. the random variable, the distribution + # these are resp. the random variable, the distribution # frequencies at these values, and the bandwidth of the Gaussian # kernel self.variable = variable From 185d57639744e8f4bcb68635b8f8b6d2a370568e Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 07:16:00 +0200 Subject: [PATCH 4/7] Adding sklearn to the dependencies --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index e99d3185..3d5fc17b 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ REQUIREMENTS: dict = { 'numpy', 'qrcode[pil]', 'scipy', + 'sklearn', 'tornado', 'voila >=0.2.4', ], From b7f4ad8c3e1af27ce7d30d2b55869e5e96301ddb Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 09:06:38 +0200 Subject: [PATCH 5/7] Adding a TODO in sampleable tests --- cara/tests/test_sampleable_distribution.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cara/tests/test_sampleable_distribution.py b/cara/tests/test_sampleable_distribution.py index b53f9ad6..b3887763 100644 --- a/cara/tests/test_sampleable_distribution.py +++ b/cara/tests/test_sampleable_distribution.py @@ -3,6 +3,8 @@ import numpy.testing as npt import pytest from cara.monte_carlo import sampleable +# TODO: seed deterministically the random number generators, here (to +# avoid random issues with tests) @pytest.mark.parametrize( "mean, std",[ From 61b06fc1f0bd80f96472fba85a31b34d20f4d055 Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 12:43:10 +0200 Subject: [PATCH 6/7] Adding LogCustomKernel sampleable distribution (curve is vs. a log variable) --- cara/monte_carlo/sampleable.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cara/monte_carlo/sampleable.py b/cara/monte_carlo/sampleable.py index 95374370..4333c93f 100644 --- a/cara/monte_carlo/sampleable.py +++ b/cara/monte_carlo/sampleable.py @@ -94,6 +94,30 @@ class CustomKernel(SampleableDistribution): return kde_model.sample(n_samples=size)[:, 0] +class LogCustomKernel(SampleableDistribution): + """ + Defines a distribution which follows a custom curve vs. the log + (in base 10) of the random variable. 
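+    Generated samples are returned in linear scale (i.e. as 10**x).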
Uses a Gaussian kernel density + fit. This is more appropriate for a noisy distribution function. + """ + def __init__(self, log_variable: float_array_size_n, + frequencies: float_array_size_n, + kernel_bandwidth: float): + # these are resp. the log of the random variable, the distribution + # frequencies at these values, and the bandwidth of the Gaussian + # kernel + self.log_variable = log_variable + self.frequencies = frequencies + self.kernel_bandwidth = kernel_bandwidth + + def generate_samples(self, size: int) -> float_array_size_n: + kde_model = KernelDensity(kernel='gaussian', + bandwidth=self.kernel_bandwidth) + kde_model.fit(self.log_variable.reshape(-1, 1), + sample_weight=self.frequencies) + return 10 ** kde_model.sample(n_samples=size)[:, 0] + + _VectorisedFloatOrSampleable = typing.Union[ SampleableDistribution, cara.models._VectorisedFloat, ] From 387fda67d844ce875f529d326b8665b562975629 Mon Sep 17 00:00:00 2001 From: Nicolas Mounet Date: Wed, 2 Jun 2021 12:45:49 +0200 Subject: [PATCH 7/7] Improving tests on sampleable distributions (more robust; using seed); adding test for LogCustomKernel --- cara/tests/test_sampleable_distribution.py | 57 ++++++++++++++++------ 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/cara/tests/test_sampleable_distribution.py b/cara/tests/test_sampleable_distribution.py index b3887763..e3801c7e 100644 --- a/cara/tests/test_sampleable_distribution.py +++ b/cara/tests/test_sampleable_distribution.py @@ -3,8 +3,8 @@ import numpy.testing as npt import pytest from cara.monte_carlo import sampleable -# TODO: seed deterministically the random number generators, here (to -# avoid random issues with tests) +# TODO: seed better the random number generators +np.random.seed(2000) @pytest.mark.parametrize( "mean, std",[ @@ -17,12 +17,14 @@ def test_normal(mean, std): sample_size = 2000000 samples = sampleable.Normal(mean, std).generate_samples(sample_size) histogram, bins = np.histogram(samples,bins=100, density=True) - x = (bins[1:]+bins[:-1])/2 - exact_dist = 1/(np.sqrt(2*np.pi)*std) * np.exp(-((x-mean)/std)**2/2) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=0.25 and b<=1.75]) + exact_dist = 1/(np.sqrt(2*np.pi)*std) * np.exp( + -((np.array(selected_bins)-mean)/std)**2/2) assert len(samples) == sample_size - npt.assert_allclose([samples.mean(), samples.std()], [mean, std], atol=mean/100) - npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + npt.assert_allclose([samples.mean(), samples.std()], [mean, std], rtol=0.01) + npt.assert_allclose(selected_histogram, exact_dist, rtol=0.02) @pytest.mark.parametrize( @@ -37,17 +39,19 @@ def test_lognormal(mean_gaussian, std_gaussian): samples = sampleable.LogNormal(mean_gaussian, std_gaussian ).generate_samples(sample_size) histogram, bins = np.histogram(samples,bins=50, density=True) - x = (bins[1:]+bins[:-1])/2 - exact_dist = ( 1/(x*np.sqrt(2*np.pi)*std_gaussian) * - np.exp(-((np.log(x)-mean_gaussian)/std_gaussian)**2/2) ) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=0.4 and b<=0.6]) + selected_bins = np.array(selected_bins) + exact_dist = ( 1/(selected_bins*np.sqrt(2*np.pi)*std_gaussian) * + np.exp(-((np.log(selected_bins)-mean_gaussian)/std_gaussian)**2/2) ) exact_mean = np.exp(mean_gaussian + std_gaussian**2/2) exact_std = np.sqrt( (np.exp(std_gaussian**2)-1) * np.exp(2*mean_gaussian + std_gaussian**2) ) assert len(samples) == sample_size 
npt.assert_allclose([samples.mean(), samples.std()], - [exact_mean, exact_std], atol=exact_mean/100) - npt.assert_allclose(histogram, exact_dist, atol=exact_dist.mean()/20) + [exact_mean, exact_std], rtol=0.01) + npt.assert_allclose(selected_histogram, exact_dist, rtol=0.02) @pytest.mark.parametrize( @@ -71,13 +75,36 @@ def test_custom(use_kernel): samples = sampleable.CustomKernel(variable, frequencies, kernel_bandwidth=0.1 ).generate_samples(sample_size) - tolerance = max_function/10 else: samples = sampleable.Custom((0, 10), function, max_function ).generate_samples(sample_size) - tolerance = max_function/50 histogram, bins = np.histogram(samples, bins=100, density=True) - correct_dist = function((bins[1:]+bins[:-1])/2) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=1 and b<=9]) + correct_dist = function(np.array(selected_bins)) assert len(samples) == sample_size - npt.assert_allclose(histogram, correct_dist, atol=tolerance) + npt.assert_allclose(selected_histogram, correct_dist, rtol=0.05) + + +def test_logcustomkernel(): + # test that the sample has approximately the right distribution + # function, for the LogCustomKernel. + # the distribution function is an inverted parabola vs. the log of + # the variable (normalized) + norm = 500/3. + function = lambda x: (-(5 - x)**2 + 25)/norm + sample_size = 2000000 + + log_variable = np.linspace(0.1,9.9,100) + frequencies = function(log_variable) + samples = sampleable.LogCustomKernel(log_variable, frequencies, + kernel_bandwidth=0.1 + ).generate_samples(sample_size) + + histogram, bins = np.histogram(np.log10(samples), bins=100, density=True) + selected_bins,selected_histogram = zip(*[(b,h) for b,h in zip( + (bins[1:]+bins[:-1])/2,histogram) if b>=1 and b<=9]) + correct_dist = function(np.array(selected_bins)) + assert len(samples) == sample_size + npt.assert_allclose(selected_histogram, correct_dist, rtol=0.05)
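For reference, a minimal usage sketch of the sampleable distributions added in
this series (class names and the generate_samples signature are as defined in
cara/monte_carlo/sampleable.py above; the numerical parameters here are
illustrative only):

import numpy as np

from cara.monte_carlo import sampleable

size = 100000

# Gaussian with mean 1 and standard deviation 0.5
normal_samples = sampleable.Normal(1., 0.5).generate_samples(size)

# lognormal, parametrized by the mean and standard deviation of the
# underlying Gaussian
lognormal_samples = sampleable.LogNormal(-0.69, 0.10).generate_samples(size)

# smooth custom density on (0, 10) with known maximum 0.15 (rejection
# sampling is used internally)
parabola = lambda x: (-(5. - x)**2 + 25.)*3./500.
custom_samples = sampleable.Custom((0., 10.), parabola,
                                   0.15).generate_samples(size)

# tabulated (possibly noisy) frequencies: Gaussian kernel density fit
variable = np.linspace(0.1, 9.9, 100)
kernel_samples = sampleable.CustomKernel(variable, parabola(variable),
                                         kernel_bandwidth=0.1
                                         ).generate_samples(size)

# same, but the tabulated curve is given vs. log10 of the variable;
# samples come back in linear scale
log_samples = sampleable.LogCustomKernel(variable, parabola(variable),
                                         kernel_bandwidth=0.1
                                         ).generate_samples(size)

All five classes share the generate_samples(size) interface of
SampleableDistribution, so they should be usable interchangeably wherever a
_VectorisedFloatOrSampleable is accepted.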