Mahoutの中にはそれっぽいライブラリが見つからなかったのだけど,科学技術計算の実験で人工データを作りたいときに困ったので,正規分布に従う乱数を生成するクラスを作ってみた.
import java.io.Serializable;
import java.util.Objects;
import java.util.Random;

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

/**
 * Generates synthetic data whose dimensions are drawn independently from
 * normal (Gaussian) distributions.
 *
 * <p>Dimension {@code i} has its own {@link Random} seeded with
 * {@code seed + i}, and its samples are computed as
 * {@code nextGaussian() * stdevs[i] + means[i]}.
 */
public class SyntheticDataGenerator implements Serializable {

  private static final long serialVersionUID = 1L;

  /** One random source per dimension. */
  private final Random[] randoms;

  /** Mean of each dimension. */
  private final double[] means;

  /** Standard deviation of each dimension. */
  private final double[] stdevs;

  /**
   * Creates a one-dimensional generator with mean 0 and standard deviation 1.
   *
   * @param seed seed of the underlying random source
   */
  public SyntheticDataGenerator(long seed) {
    this(seed, 1, new double[] {0}, new double[] {1});
  }

  /**
   * Creates a generator with the given number of dimensions.
   *
   * @param seed base seed; dimension {@code i} is seeded with {@code seed + i}
   * @param cardinality number of dimensions
   * @param means mean of each dimension; length must equal {@code cardinality}
   * @param stdevs standard deviation of each dimension; length must equal
   *     {@code cardinality}
   * @throws NullPointerException if {@code means} or {@code stdevs} is null
   * @throws IllegalArgumentException if either array length differs from
   *     {@code cardinality}
   */
  public SyntheticDataGenerator(long seed, int cardinality, double[] means, double[] stdevs) {
    Objects.requireNonNull(means, "means");
    Objects.requireNonNull(stdevs, "stdevs");
    if (cardinality != means.length || cardinality != stdevs.length) {
      throw new IllegalArgumentException("Invalid cardinality.");
    }

    randoms = new Random[cardinality];
    for (int i = 0; i < cardinality; i++) {
      randoms[i] = new Random(seed + i);
    }

    // Copy the arrays so that later changes made by the caller cannot
    // silently alter the distribution of this generator.
    this.means = means.clone();
    this.stdevs = stdevs.clone();
  }

  /**
   * Returns the next sample of the first dimension (index 0).
   *
   * @return the next sample of dimension 0
   * @throws ArrayIndexOutOfBoundsException if this generator has no dimensions
   */
  public double nextDouble() {
    return nextDouble(0);
  }

  /**
   * Returns the next sample of the given dimension.
   *
   * @param i zero-based dimension index
   * @return the next sample of dimension {@code i}
   * @throws ArrayIndexOutOfBoundsException if {@code i} is not a valid dimension index
   */
  protected double nextDouble(int i) {
    return randoms[i].nextGaussian() * stdevs[i] + means[i];
  }

  /**
   * Returns one sample for every dimension, in dimension order.
   *
   * @return a new array with one element per dimension
   */
  public double[] nextDoubles() {
    double[] values = new double[randoms.length];
    for (int i = 0; i < values.length; i++) {
      values[i] = nextDouble(i);
    }
    return values;
  }

  /**
   * Returns one sample for every dimension as a Mahout {@link Vector}.
   *
   * @return a {@link DenseVector} built from {@link #nextDoubles()}
   */
  public Vector nextVector() {
    return new DenseVector(nextDoubles());
  }
}
1次元の正規分布に基づく人工データを作りたいときには以下のような感じ.
double[] means = new double[1]; double[] stdevs = new double[1]; means[0] = 10; // 平均 10 stdevs[0] = 5; // 標準偏差 5 SyntheticDataGenerator generator = new SyntheticDataGenerator(0, 1, means, stdevs);
あとは,generator.nextDouble() で値を取得していくと指定した分布の乱数が取得できる.そんで,多次元のデータが欲しい場合は,各次元ごとのmeansとstdevsを配列に格納して,generator.nextDoubles() としてやれば配列がとれるし,nextVector()でMahoutのVectorとして取得できる.