本文整理汇总了Java中org.deeplearning4j.models.word2vec.Word2Vec类的典型用法代码示例。如果您正苦于以下问题:Java Word2Vec类的具体用法?Java Word2Vec怎么用?Java Word2Vec使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
Word2Vec类属于org.deeplearning4j.models.word2vec包,在下文中一共展示了Word2Vec类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: testWriteWordVectorsFromWord2Vec
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    // Round-trip check: load the binary Google News model, write it back out
    // as text, reload, and verify the vectors survive unchanged.
    WordVectors googleModel = WordVectorSerializer.loadGoogleModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) googleModel, pathToWriteto);

    WordVectors reloaded = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray freeman = reloaded.getWordVectorMatrix("Morgan_Freeman");
    INDArray montalbano = reloaded.getWordVectorMatrix("JA_Montalbano");

    // Reloaded vectors must match the source model exactly.
    assertEquals(googleModel.getWordVectorMatrix("Morgan_Freeman"), freeman);
    assertEquals(googleModel.getWordVectorMatrix("JA_Montalbano"), montalbano);

    // Google News vectors are 300-dimensional.
    assertTrue(freeman.length() == 300);
    assertTrue(montalbano.length() == 300);

    // Spot-check the first component of each vector.
    assertEquals(freeman.getDouble(0), 0.044423, 1e-3);
    assertEquals(montalbano.getDouble(0), 0.051964, 1e-3);
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:17,代码来源:WordVectorSerializerTest.java
示例2: testFindNamesFromText
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    // Stream the Chinese-name corpus one sentence per line.
    SentenceIterator sentences = new BasicLineIterator("src/test/resources/chineseName.txt");
    log.info("load is right!");

    TokenizerFactory chineseTokenizers = new ChineseTokenizerFactory();
    //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());

    // Train word vectors over the dataset stored in the resources folder.
    Word2Vec model = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
            .learningRate(0.1).windowSize(20).iterate(sentences).tokenizerFactory(chineseTokenizers).build();
    model.fit();

    WordVectorSerializer.writeWordVectors(model, new File("src/test/resources/chineseNameWordVector.txt"));

    // Intended follow-up (not implemented): train a model that extracts names
    // from news text (.txt suffix) using the vectors generated above, then
    // evaluate whether it finds names in unseen text.
    // WordVectors wordVectors;
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:ChineseTokenizerTest.java
示例3: resetWeights
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * (Re)initializes the GloVe lookup-table state: the syn0 weight matrix,
 * the per-weight AdaGrad accumulators, and the bias vector.
 *
 * Each component is created only when missing, unless {@code reset} is true,
 * in which case it is rebuilt unconditionally.
 *
 * @param reset if true, discard and re-create all weights even if present
 */
@Override
public void resetWeights(boolean reset) {
    // Lazily grab the default RNG so initialization respects Nd4j's global seed.
    if (rng == null)
        this.rng = Nd4j.getRandom();
    //note the +2 which is the unk vocab word and the bias
    if (syn0 == null || reset) {
        // Uniform init centered at 0, scaled down by vector length;
        // one row per vocab word plus one extra row for the UNK token.
        syn0 = Nd4j.rand(new int[] {vocab.numWords() + 1, vectorLength}, rng).subi(0.5).divi((double) vectorLength);
        INDArray randUnk = Nd4j.rand(1, vectorLength, rng).subi(0.5).divi(vectorLength);
        putVector(Word2Vec.DEFAULT_UNK, randUnk);
    }
    // AdaGrad state must match the weight-matrix shape.
    if (weightAdaGrad == null || reset) {
        weightAdaGrad = new AdaGrad(new int[] {vocab.numWords() + 1, vectorLength}, lr.get());
    }
    //right after unknown
    // One bias entry per syn0 row (zero-initialized by Nd4j.create).
    if (bias == null || reset)
        bias = Nd4j.create(syn0.rows());
    if (biasAdaGrad == null || reset) {
        biasAdaGrad = new AdaGrad(bias.shape(), lr.get());
    }
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:27,代码来源:GloveWeightLookupTable.java
示例4: asExampleArray
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Converts a window (each word in the window) into a vector.
 *
 * Keep in mind each window is a multi-word context.
 *
 * Each word uses the passed-in model as a lookup table to fetch the vectors
 * relevant to the passed-in window.
 *
 * @param window the window to take in
 * @param vec the model to use as a lookup table
 * @param normalize if true, use normalized word vectors
 * @return a concatenated 1-row array containing the numbers for each word
 *         in the window, laid out word-by-word
 */
public static INDArray asExampleArray(Window window, Word2Vec vec, boolean normalize) {
    // Hoist the layer size once instead of re-querying the lookup table
    // for every slice computation inside the loop.
    int length = vec.lookupTable().layerSize();
    List<String> words = window.getWords();
    int windowSize = vec.getWindow();
    assert words.size() == windowSize;
    INDArray ret = Nd4j.create(length * windowSize);
    for (int i = 0; i < words.size(); i++) {
        String word = words.get(i);
        INDArray n = normalize ? vec.getWordVectorMatrixNormalized(word) : vec.getWordVectorMatrix(word);
        // Word i occupies the slice [i*length, (i+1)*length) of the output row.
        ret.put(new INDArrayIndex[] {NDArrayIndex.interval(i * length, i * length + length)}, n);
    }
    return ret;
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:34,代码来源:WindowConverter.java
示例5: testWord2VecPlot
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Manual test: trains word2vec on the bundled raw_sentences corpus so the
 * vocabulary could be plotted through the (currently disabled) UI server.
 */
@Test
public void testWord2VecPlot() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
            .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
            .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
            .tokenizerFactory(t).build();
    vec.fit();
    // The UI plotting hook is commented out; the long sleep was presumably
    // meant to keep the JVM alive while inspecting the plot manually.
    // UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
    // vec.getLookupTable().plotVocab(100, connectionInfo);
    Thread.sleep(10000000000L);
    // Deliberate failure marker: the visualization check is not implemented.
    fail("Not implemented");
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:23,代码来源:ManualTests.java
示例6: main
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * End-to-end word2vec example: reads a raw-text corpus, trains a model,
 * writes the vectors to disk, and prints nearest neighbours plus a
 * similarity score.
 */
public static void main(String[] args) throws Exception {
    // Location of the raw training corpus.
    String filePath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // One sentence per line, surrounding whitespace stripped.
    SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
    // Whitespace tokenization with common token normalization.
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Vocabulary cache plus the in-memory weight table backing the embeddings.
    InMemoryLookupCache cache = new InMemoryLookupCache();
    WeightLookupTable table = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(cache)
            .lr(0.025f).build();

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)
            .layerSize(100).lookupTable(table)
            .stopWords(new ArrayList<String>())
            .vocabCache(cache).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> nearest = vec.wordsNearest("man", 5);
    System.out.println(nearest);

    double cosSim = vec.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
开发者ID:PacktPublishing,项目名称:Java-Data-Science-Cookbook,代码行数:41,代码来源:Word2VecRawTextExample.java
示例7: w2vBuilder
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Configures a Word2Vec trainer intended for a large corpus.
 *
 * @param iter sentence source for training
 * @param t tokenizer factory applied to each sentence
 * @return a configured (untrained) Word2Vec model
 */
public static Word2Vec w2vBuilder(SentenceIterator iter, TokenizerFactory t) {
    return new Word2Vec.Builder()
            .seed(12345)
            .iterate(iter)
            .tokenizerFactory(t)
            .batchSize(1000)
            .allowParallelTokenization(true)             // tokenize mini-batches concurrently
            .epochs(1)                                   // full passes over the training corpus
            .iterations(3)                               // updates per mini-batch
            .elementsLearningAlgorithm(new SkipGram<>()) // swap in new CBOW<>() for CBOW training
            .minWordFrequency(50)                        // prune words rarer than this count
            .windowSize(5)                               // max skip distance between words
            .learningRate(0.05)                          // initial learning rate
            .minLearningRate(5e-4)                       // floor for learning-rate decay
            .negativeSample(10)                          // negative samples per positive example
            .sampling(1e-5)                              // sub-sample very frequent words
            .useHierarchicSoftmax(true)                  // hierarchical softmax output layer
            .layerSize(300)                              // embedding dimensionality
            .workers(8)                                  // training threads
            .build();
}
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:24,代码来源:Word2VecTrainer.java
示例8: before
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Lazily trains the shared Word2Vec test fixture over the /labeled corpus.
 * Runs before each test but only fits the model once; later tests reuse it.
 */
@Before
public void before() throws Exception {
    if (vec == null) {
        ClassPathResource resource = new ClassPathResource("/labeled/");
        File file = resource.getFile();
        SentenceIterator iter = UimaSentenceIterator.createWithPath(file.getAbsolutePath());
        // Remove any stale serialized cache left over from a previous run.
        new File("cache.ser").delete();
        TokenizerFactory t = new UimaTokenizerFactory();
        vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100)
                .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter)
                .tokenizerFactory(t).build();
        vec.fit();
    }
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:18,代码来源:Word2VecIteratorTest.java
示例9: fromPair
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
* Load word vectors from the given pair
*
* @param pair
* the given pair
* @return a read only word vectors impl based on the given lookup table and vocab
*/
public static Word2Vec fromPair(Pair<InMemoryLookupTable, VocabCache> pair) {
Word2Vec vectors = new Word2Vec();
vectors.setLookupTable(pair.getFirst());
vectors.setVocab(pair.getSecond());
vectors.setModelUtils(new BasicModelUtils());
return vectors;
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:15,代码来源:WordVectorSerializer.java
示例10: w2vBuilder4SmallCorpus
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Configures a Word2Vec trainer with lighter settings suited to a small corpus.
 *
 * @param iter sentence source for training
 * @param t tokenizer factory applied to each sentence
 * @return a configured (untrained) Word2Vec model
 */
@SuppressWarnings("unused")
public static Word2Vec w2vBuilder4SmallCorpus(SentenceIterator iter, TokenizerFactory t) {
    return new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)      // single pass, modest frequency cutoff
            .layerSize(100).seed(42)                // 100-dim vectors, deterministic seed
            .windowSize(5)
            .iterate(iter).tokenizerFactory(t)
            .learningRate(0.025).minLearningRate(1e-3)
            .build();
}
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:15,代码来源:Word2VecTrainer.java
示例11: main
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Entry point: loads the pre-trained GoogleNews word2vec model and runs three
 * semantic evaluations against it — TOEFL synonym selection, word analogies
 * ("king - queen = man - woman"), and the WordSim-353 benchmarks.
 *
 * NOTE: GoogleNews-vectors-negative300.bin must be downloaded beforehand,
 * and the hard-coded local path below adjusted to where it lives.
 */
public static void main (String[] args) throws FileNotFoundException {
    // download GoogleNews-vectors-negative300.bin.gz first
    // load google news vectors for measurements
    log.info("load word2vec model");
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel(
            new File("/Users/zhanghao/Documents/Files/GoogleNews-vectors-negative300.bin"));
    log.info("done.");
    log.info("Semantic Property Task...");
    // 1. TOEFL test
    log.info("|********************load TOEFL data********************|");
    List<Word2VecTOEFLTest.TFLNode> tflList = loadTOEFLData();
    log.info("run the test");
    TOEFLTest(tflList, w2v);
    log.info("|*************************done.*************************|");
    // 2. Analogy test -- "king - queen = man - woman"
    log.info("|*******************load Syn_Sem data*******************|");
    Map<String, List<Word2VecAnalogyTest.SynSemNode>> anaMap = loadSynSemData();
    log.info("run the test");
    AnalogyTest(anaMap, w2v);
    log.info("|*************************done.*************************|");
    // 3. WS353 test -- full set plus the relatedness and similarity subsets
    log.info("|********************load WS353 data********************|");
    LinkedList<Word2VecWS353Test.WS353Node> wsList = loadWS353Data("ws/ws353.txt");
    LinkedList<Word2VecWS353Test.WS353Node> wsListRel = loadWS353Data("ws/ws353_relatedness.txt");
    LinkedList<Word2VecWS353Test.WS353Node> wsListSim = loadWS353Data("ws/ws353_similarity.txt");
    log.info("done.");
    log.info("run the test");
    WS353Test(w2v, wsList, "WS353");
    WS353Test(w2v, wsListRel, "WS353 Relatedness");
    WS353Test(w2v, wsListSim, "WS353 Similarity");
    log.info("|*************************done.*************************|");
}
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:36,代码来源:DL4JWord2VecSemanticExample.java
示例12: TOEFLTest
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Runs the TOEFL synonym evaluation: for each question word, selects the
 * answer choice with the highest cosine similarity, records the prediction
 * on the node, and logs overall accuracy plus the number of unanswerable
 * (ignored) questions.
 *
 * @param tflList TOEFL question nodes (question word, choices, gold answer)
 * @param w2v word2vec model used to score word similarity
 */
private static void TOEFLTest(List<Word2VecTOEFLTest.TFLNode> tflList, Word2Vec w2v) {
    int accuracy = 0;
    int ignore = 0;
    for (int i = 0; i < tflList.size(); i++) {
        Word2VecTOEFLTest.TFLNode node = tflList.get(i);
        int bestId = -1;
        // BUG FIX: the original seeded this with Double.MIN_VALUE, which is the
        // smallest POSITIVE double — so a choice with a negative cosine
        // similarity could never be selected even when it was the best one.
        // NEGATIVE_INFINITY accepts any real similarity; NaN scores (e.g. from
        // unknown words) still fail the comparison, so a question with no
        // scorable choice keeps bestId == -1 and is counted as ignored.
        double cosValue = Double.NEGATIVE_INFINITY;
        for (int k = 0; k < node.choices.length; k++) {
            double cosSim = w2v.similarity(node.ques, node.choices[k]);
            if (cosSim > cosValue) {
                bestId = k;
                cosValue = cosSim;
            }
        }
        tflList.get(i).setPredict(bestId);
        log.info((i + 1) + "--" + tflList.get(i).toFileString() + "\n");
        if (tflList.get(i).predict == -1)
            ignore++;
        if (tflList.get(i).ans == tflList.get(i).predict)
            accuracy += 1;
    }
    log.info("Total Questions: " + tflList.size() + ", Ignore: " + ignore + ", Accuracy: " +
            String.format("%.2f", (1.0 * accuracy) / tflList.size() * 100.0) + "%(" + accuracy + "/" +
            tflList.size() + ")");
}
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:26,代码来源:DL4JWord2VecSemanticExample.java
示例13: getWord2Vec
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Trains one Word2Vec model per configured language and registers each in
 * languageWord2VecMap, saving the model to disk along the way.
 */
private void getWord2Vec() {
    t.setTokenPreProcessor(new CommonPreprocessor());
    for (Language language: languages.keySet()) {
        // Gather that language's sentences and wrap them in an iterator.
        List<String> sentences = getSentencesFromLanguage(language);
        SentenceIterator iter = new CollectionSentenceIterator(PREPROCESSOR, sentences);
        Word2Vec vec = new Word2Vec.Builder().elementsLearningAlgorithm(learningAlgorithm)
                .minWordFrequency(6)
                .iterations(15)
                .layerSize(VEC_LENGTH)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(t)
                .build();
        vec.fit();
        // Persist and cache the trained model for this language.
        saveModel(vec, language);
        languageWord2VecMap.put(language, vec);
    }
}
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:25,代码来源:Pan15Word2Vec.java
示例14: main
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Trains word vectors on the train.tsv corpus, dumps the vocabulary to
 * vocab.csv, prints nearest neighbours for two probe words, and renders a
 * t-SNE plot of the vocabulary.
 */
public static void main(String[] args) throws Exception {
    // Materialize the bundled training data into the working directory.
    ClassPathResource r = new ClassPathResource("/train.tsv");
    if(r.exists()) {
        // FIX: try-with-resources closes both streams even if the copy throws;
        // the original closed them only on the success path and leaked on error.
        try (InputStream is = r.getInputStream();
             BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(new File("train.tsv")))) {
            IOUtils.copy(is, bos);
            bos.flush();
        }
    }
    SentenceIterator docIter = new CollectionSentenceIterator(new SentenceToPhraseMapper(new File("train.tsv")).sentences());
    TokenizerFactory factory = new DefaultTokenizerFactory();
    Word2Vec vec = new Word2Vec.Builder().iterate(docIter)
            .tokenizerFactory(factory).batchSize(10000)
            .learningRate(2.5e-2).sampling(5).learningRateDecayWords(10000)
            .iterations(3).minWordFrequency(1)
            .layerSize(300).windowSize(5).build();
    vec.fit();
    // Persist the learned vocabulary, one word per line.
    FileUtils.writeLines(new File("vocab.csv"),vec.getCache().words());
    String word = "amusing";
    String otherWord = "turd";
    System.out.println("Words nearest " + word + " " + vec.wordsNearest(word,10));
    System.out.println("Words nearest " + otherWord + " " + vec.wordsNearest(otherWord,10));
    // Project the vocabulary to 2D with t-SNE for visualization.
    Tsne t = new Tsne.Builder()
            .setMaxIter(100).stopLyingIteration(20).build();
    vec.getCache().plotVocab(t);
}
开发者ID:ihuerga,项目名称:deeplearning4j-nlp-examples,代码行数:36,代码来源:Visualization.java
示例15: writeWord2VecModel
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Saves the Word2Vec model to the given file as a compressed zip archive.
 * PLEASE NOTE: this saves the FULL model, including syn0 AND syn1.
 *
 * @param vectors the model to serialize
 * @param file destination file
 */
public static void writeWord2VecModel(Word2Vec vectors, File file) {
    try (FileOutputStream fileOut = new FileOutputStream(file);
            BufferedOutputStream buffered = new BufferedOutputStream(fileOut)) {
        // Delegate to the stream-based overload; try-with-resources closes both.
        writeWord2VecModel(vectors, buffered);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:14,代码来源:WordVectorSerializer.java
示例16: getWordEmbeddings
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Returns the word-embedding vector for each token of the given sentence,
 * using the previously trained per-language model.
 *
 * @param sentence raw input sentence to tokenize
 * @param language which language's model to look embeddings up in
 * @return one embedding array per token, in token order
 */
public List<double[]> getWordEmbeddings(String sentence, Language language) {
    t.setTokenPreProcessor(new CommonPreprocessor());
    List<String> tokens = t.create(sentence).getTokens();
    // NOTE(review): this tf-idf array is computed but never used below —
    // either dead code, or the weights were meant to scale the embeddings.
    // Verify intent before removing.
    double[] tfidf = new double[tokens.size()];
    for (int i = 0; i < tfidf.length; i++) {
        tfidf[i] = Utils.tfIdf(sentence, getSentencesFromLanguage(language), tokens.get(i));
    }
    Word2Vec loadedVec = languageWord2VecMap.get(language);
    return tokens.stream().map(loadedVec::getWordVector).collect(Collectors.toList());
}
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:11,代码来源:Pan15Word2Vec.java
示例17: shouldLoadAndCreateSameWord2Vec
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Trains a fresh English Word2Vec model from the PAN15 corpus and compares
 * its nearest-neighbour output against three models loaded from disk (one
 * per learning algorithm). Comparison is by printed inspection only; the
 * sole hard assertion is that the first loaded model is non-null.
 */
@Test
public void shouldLoadAndCreateSameWord2Vec() {
    //given
    Pan15Parser parser = new Pan15Parser();
    HashMap<String, Pan15Author> english = parser.parseCSVCorpus().get(Language.ENGLISH);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    // Flatten every author's documents into one sentence list.
    List<String> englishSentences = english.values().stream().map(Author::getDocuments)
            .collect(Collectors.toList())
            .stream().flatMap(List::stream).collect(Collectors.toList());
    SentenceIterator englishIter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), englishSentences);
    // when
    Word2Vec englishVec = new Word2Vec.Builder()
            .minWordFrequency(6)
            .iterations(15)
            .layerSize(250)
            .seed(42)
            .windowSize(5)
            .iterate(englishIter)
            .tokenizerFactory(t)
            .build();
    englishVec.fit();
    // Load pre-trained models for each learning algorithm variant.
    Word2Vec loadedEnglishVec1 = new Pan15Word2Vec(new SkipGram<>()).readModelFromFile(Language.ENGLISH);
    Word2Vec loadedEnglishVec2 = new Pan15Word2Vec(new CBOW<>()).readModelFromFile(Language.ENGLISH);
    Word2Vec loadedEnglishVec3 = new Pan15Word2Vec(new GloVe<>()).readModelFromFile(Language.ENGLISH);
    // Re-attach tokenizer and iterator, which are not serialized with the model.
    loadedEnglishVec1.setTokenizerFactory(t);
    loadedEnglishVec1.setSentenceIterator(englishIter);
    loadedEnglishVec2.setTokenizerFactory(t);
    loadedEnglishVec2.setSentenceIterator(englishIter);
    loadedEnglishVec3.setTokenizerFactory(t);
    loadedEnglishVec3.setSentenceIterator(englishIter);
    //then
    Assert.assertNotNull(loadedEnglishVec1);
    // Eyeball comparison: neighbours of "home" across all four models.
    System.out.println(englishVec.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec1.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec2.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec3.wordsNearest("home", 15));
}
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:42,代码来源:Pan15Word2VecTest.java
示例18: RottenTomatoesWordVectorDataFetcher
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Creates a data fetcher that iterates the Rotten Tomatoes corpus and uses
 * the supplied word2vec model for vector lookups.
 *
 * @param vec trained word2vec model backing this fetcher
 */
public RottenTomatoesWordVectorDataFetcher(Word2Vec vec) {
    iter = new RottenTomatoesLabelAwareSentenceIterator();
    this.vec = vec;
}
开发者ID:ihuerga,项目名称:deeplearning4j-nlp-examples,代码行数:6,代码来源:RottenTomatoesWordVectorDataFetcher.java
示例19: getVec
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * @return the word2vec model backing this fetcher
 */
public Word2Vec getVec() {
    return vec;
}
开发者ID:ihuerga,项目名称:deeplearning4j-nlp-examples,代码行数:4,代码来源:RottenTomatoesWordVectorDataFetcher.java
示例20: main
import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Semantic-hashing visualization pipeline: trains word2vec on train.tsv,
 * pretrains a DBN autoencoder over the raw embedding matrix (syn0), fine-tunes
 * it via SemanticHashing, reconstructs the embeddings through the network,
 * and finally plots the result with t-SNE.
 */
public static void main(String[] args) throws Exception {
    SentenceIterator docIter = new CollectionSentenceIterator(new SentenceToPhraseMapper(new ClassPathResource("/train.tsv").getFile()).sentences());
    TokenizerFactory factory = new DefaultTokenizerFactory();
    Word2Vec vec = new Word2Vec.Builder().iterate(docIter).tokenizerFactory(factory).batchSize(100000)
            .learningRate(2.5e-2).iterations(1)
            .layerSize(100).windowSize(5).build();
    vec.fit();
    // Network layer sizes match the 100-dim word vectors produced above.
    NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder().nIn(vec.getLayerSize()).nOut(vec.getLayerSize())
            .hiddenUnit(RBM.HiddenUnit.RECTIFIED).visibleUnit(RBM.VisibleUnit.GAUSSIAN).momentum(0.5f)
            .iterations(10).learningRate(1e-6f).build();
    InMemoryLookupCache l = (InMemoryLookupCache) vec.getCache();
    DBN d = new DBN.Builder()
            .configure(conf).hiddenLayerSizes(new int[]{250,100,2})
            .build();
    // Autoencoder setup: input and target are both the embedding matrix syn0.
    DataSet dPretrain = new DataSet(l.getSyn0(),l.getSyn0());
    DataSetIterator dPretrainIter = new ListDataSetIterator(dPretrain.asList(),1000);
    // Layer-wise pretraining over mini-batches of 1000 rows.
    while(dPretrainIter.hasNext()) {
        d.pretrain(dPretrainIter.next().getFeatureMatrix(), 1, 1e-6f, 10);
    }
    // d.pretrain(l.getSyn0(),1,1e-3f,1000);
    d.getOutputLayer().conf().setLossFunction(LossFunctions.LossFunction.RMSE_XENT);
    // Wrap the pretrained DBN as the encoder of a semantic-hashing network.
    SemanticHashing s = new SemanticHashing.Builder().withEncoder(d)
            .build();
    d = null; // release the standalone DBN reference; it now lives inside s
    dPretrainIter.reset();
    // Fine-tune the full encoder/decoder on the same mini-batches.
    while(dPretrainIter.hasNext()) {
        s.fit(dPretrainIter.next());
    }
    Tsne t = new Tsne.Builder()
            .setMaxIter(100).stopLyingIteration(20).build();
    // Reconstruct embeddings through 4 layers of the network for plotting.
    INDArray output = s.reconstruct(l.getSyn0(),4);
    // Flush the raw weight buffers before releasing the network.
    l.getSyn0().data().flush();
    l.getSyn1().data().flush();
    s = null;
    System.out.println(Arrays.toString(output.shape()));
    // Plot the reconstructed points in 2D, labelled by vocabulary word.
    t.plot(output,2,new ArrayList<>(vec.getCache().words()));
    vec.getCache().plotVocab(t);
}
开发者ID:ihuerga,项目名称:deeplearning4j-nlp-examples,代码行数:55,代码来源:VisualizationSemanticHashing.java
注:本文中的org.deeplearning4j.models.word2vec.Word2Vec类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论