forked from laiweijian4/doc-exports
Reviewed-by: Kacur, Michal <michal.kacur@t-systems.com> Co-authored-by: Wuwan, Qi <wuwanqi1@noreply.gitea.eco.tsi-dev.otc-service.com> Co-committed-by: Wuwan, Qi <wuwanqi1@noreply.gitea.eco.tsi-dev.otc-service.com>
329 lines
21 KiB
HTML
329 lines
21 KiB
HTML
<a name="css_01_0129"></a><a name="css_01_0129"></a>
|
|
|
|
<h1 class="topictitle1">Sample Code for Vector Search on a Client</h1>
|
|
<div id="body0000001261749932"><p id="css_01_0129__en-us_topic_0000001261749932_p143381112172510">Elasticsearch provides standard REST APIs and clients developed using Java, Python, and Go.</p>
|
|
<p id="css_01_0129__en-us_topic_0000001261749932_p1675193372415">Based on the open-source dataset <strong id="css_01_0129__en-us_topic_0000001261749932_b215561919574">SIFT1M</strong> (http://corpus-texmex.irisa.fr/) and the Python Elasticsearch client, this section provides a code snippet for creating a vector index, importing vector data, and querying vector data on the client.</p>
|
|
<div class="section" id="css_01_0129__en-us_topic_0000001261749932_section1864223419264"><h4 class="sectiontitle">Prerequisites</h4><p id="css_01_0129__en-us_topic_0000001261749932_p1299195810328">The Python dependency packages have been installed on the client. If they are not installed, run the following commands to install them:</p>
|
|
<pre class="screen" id="css_01_0129__en-us_topic_0000001261749932_screen1932333123212">pip install numpy
|
|
pip install elasticsearch==7.6.0</pre>
|
|
</div>
|
|
<div class="section" id="css_01_0129__en-us_topic_0000001261749932_section10885342132619"><h4 class="sectiontitle">Sample Code</h4><div class="codecoloring" codetype="Python" id="css_01_0129__en-us_topic_0000001261749932_screen8396163917303"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre> 1
|
|
2
|
|
3
|
|
4
|
|
5
|
|
6
|
|
7
|
|
8
|
|
9
|
|
10
|
|
11
|
|
12
|
|
13
|
|
14
|
|
15
|
|
16
|
|
17
|
|
18
|
|
19
|
|
20
|
|
21
|
|
22
|
|
23
|
|
24
|
|
25
|
|
26
|
|
27
|
|
28
|
|
29
|
|
30
|
|
31
|
|
32
|
|
33
|
|
34
|
|
35
|
|
36
|
|
37
|
|
38
|
|
39
|
|
40
|
|
41
|
|
42
|
|
43
|
|
44
|
|
45
|
|
46
|
|
47
|
|
48
|
|
49
|
|
50
|
|
51
|
|
52
|
|
53
|
|
54
|
|
55
|
|
56
|
|
57
|
|
58
|
|
59
|
|
60
|
|
61
|
|
62
|
|
63
|
|
64
|
|
65
|
|
66
|
|
67
|
|
68
|
|
69
|
|
70
|
|
71
|
|
72
|
|
73
|
|
74
|
|
75
|
|
76
|
|
77
|
|
78
|
|
79
|
|
80
|
|
81
|
|
82
|
|
83
|
|
84
|
|
85
|
|
86
|
|
87
|
|
88
|
|
89
|
|
90
|
|
91
|
|
92
|
|
93
|
|
94
|
|
95
|
|
96
|
|
97
|
|
98
|
|
99
|
|
100
|
|
101
|
|
102
|
|
103
|
|
104
|
|
105
|
|
106
|
|
107
|
|
108
|
|
109
|
|
110
|
|
111
|
|
112
|
|
113
|
|
114
|
|
115
|
|
116
|
|
117
|
|
118
|
|
119
|
|
120
|
|
121
|
|
122
|
|
123
|
|
124
|
|
125
|
|
126
|
|
127
|
|
128
|
|
129
|
|
130
|
|
131
|
|
132
|
|
133
|
|
134
|
|
135
|
|
136
|
|
137
|
|
138
|
|
139
|
|
140
|
|
141
|
|
142
|
|
143
|
|
144
|
|
145
|
|
146
|
|
147
|
|
148
|
|
149
|
|
150
|
|
151
|
|
152
|
|
153
|
|
154
|
|
155</pre></div></td><td class="code"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
|
|
<span class="kn">import</span> <span class="nn">time</span>
|
|
<span class="kn">import</span> <span class="nn">json</span>
|
|
|
|
<span class="kn">from</span> <span class="nn">concurrent.futures</span> <span class="kn">import</span> <span class="n">ThreadPoolExecutor</span><span class="p">,</span> <span class="n">wait</span>
|
|
<span class="kn">from</span> <span class="nn">elasticsearch</span> <span class="kn">import</span> <span class="n">Elasticsearch</span>
|
|
<span class="kn">from</span> <span class="nn">elasticsearch</span> <span class="kn">import</span> <span class="n">helpers</span>
|
|
|
|
<span class="n">endpoint</span> <span class="o">=</span> <span class="s1">'http://xxx.xxx.xxx.xxx:9200/'</span>
|
|
|
|
<span class="c1"># Construct an Elasticsearch client object</span>
|
|
<span class="n">es</span> <span class="o">=</span> <span class="n">Elasticsearch</span><span class="p">(</span><span class="n">endpoint</span><span class="p">)</span>
|
|
|
|
<span class="c1"># Index mapping information</span>
|
|
<span class="n">index_mapping</span> <span class="o">=</span> <span class="s1">'''</span>
|
|
<span class="s1">{</span>
|
|
<span class="s1"> "settings": {</span>
|
|
<span class="s1"> "index": {</span>
|
|
<span class="s1"> "vector": "true"</span>
|
|
<span class="s1"> }</span>
|
|
<span class="s1"> },</span>
|
|
<span class="s1"> "mappings": {</span>
|
|
<span class="s1"> "properties": {</span>
|
|
<span class="s1"> "my_vector": {</span>
|
|
<span class="s1"> "type": "vector",</span>
|
|
<span class="s1"> "dimension": 128,</span>
|
|
<span class="s1"> "indexing": true,</span>
|
|
<span class="s1"> "algorithm": "GRAPH",</span>
|
|
<span class="s1"> "metric": "euclidean"</span>
|
|
<span class="s1"> }</span>
|
|
<span class="s1"> }</span>
|
|
<span class="s1"> }</span>
|
|
<span class="s1">}</span>
|
|
<span class="s1">'''</span>
|
|
|
|
<span class="c1"># Create an index.</span>
|
|
<span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">mapping</span><span class="p">):</span>
|
|
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">,</span> <span class="n">ignore</span><span class="o">=</span><span class="mi">400</span><span class="p">,</span> <span class="n">body</span><span class="o">=</span><span class="n">mapping</span><span class="p">)</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
|
|
|
|
<span class="c1"># Delete an index.</span>
|
|
<span class="k">def</span> <span class="nf">delete_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">):</span>
|
|
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">delete</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">)</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># Refresh an index.</span>
|
|
<span class="k">def</span> <span class="nf">refresh_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">):</span>
|
|
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">refresh</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">)</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># Merge index segments.</span>
|
|
<span class="k">def</span> <span class="nf">merge_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">seg_cnt</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
|
|
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
|
|
<span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">forcemerge</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">,</span> <span class="n">max_num_segments</span><span class="o">=</span><span class="n">seg_cnt</span><span class="p">,</span> <span class="n">request_timeout</span><span class="o">=</span><span class="mi">36000</span><span class="p">)</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">" Complete the merge within {time.time() - start} seconds"</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># Load vector data.</span>
|
|
<span class="k">def</span> <span class="nf">load_vectors</span><span class="p">(</span><span class="n">file_name</span><span class="p">):</span>
|
|
<span class="n">fv</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">fromfile</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
|
|
<span class="n">dim</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
|
|
<span class="n">vectors</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">dim</span><span class="p">)[:,</span> <span class="mi">1</span><span class="p">:]</span>
|
|
<span class="k">return</span> <span class="n">vectors</span>
|
|
|
|
|
|
<span class="c1"># Load the ground_truth data.</span>
|
|
<span class="k">def</span> <span class="nf">load_gts</span><span class="p">(</span><span class="n">file_name</span><span class="p">):</span>
|
|
<span class="n">fv</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">fromfile</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">)</span>
|
|
<span class="n">dim</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
|
|
<span class="n">gts</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">dim</span><span class="p">)[:,</span> <span class="mi">1</span><span class="p">:]</span>
|
|
<span class="k">return</span> <span class="n">gts</span>
|
|
|
|
|
|
<span class="k">def</span> <span class="nf">partition</span><span class="p">(</span><span class="n">ls</span><span class="p">,</span> <span class="n">size</span><span class="p">):</span>
|
|
<span class="k">return</span> <span class="p">[</span><span class="n">ls</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="n">size</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">ls</span><span class="p">),</span> <span class="n">size</span><span class="p">)]</span>
|
|
|
|
|
|
<span class="c1"># Write vector data.</span>
|
|
<span class="k">def</span> <span class="nf">write_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">vec_file</span><span class="p">):</span>
|
|
<span class="n">pool</span> <span class="o">=</span> <span class="n">ThreadPoolExecutor</span><span class="p">(</span><span class="n">max_workers</span><span class="o">=</span><span class="mi">8</span><span class="p">)</span>
|
|
<span class="n">tasks</span> <span class="o">=</span> <span class="p">[]</span>
|
|
|
|
<span class="n">vectors</span> <span class="o">=</span> <span class="n">load_vectors</span><span class="p">(</span><span class="n">vec_file</span><span class="p">)</span>
|
|
<span class="n">bulk_size</span> <span class="o">=</span> <span class="mi">1000</span>
|
|
<span class="n">partitions</span> <span class="o">=</span> <span class="n">partition</span><span class="p">(</span><span class="n">vectors</span><span class="p">,</span> <span class="n">bulk_size</span><span class="p">)</span>
|
|
|
|
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
|
|
<span class="n">start_id</span> <span class="o">=</span> <span class="mi">0</span>
|
|
<span class="k">for</span> <span class="n">vecs</span> <span class="ow">in</span> <span class="n">partitions</span><span class="p">:</span>
|
|
<span class="n">tasks</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">pool</span><span class="o">.</span><span class="n">submit</span><span class="p">(</span><span class="n">write_bulk</span><span class="p">,</span> <span class="n">index_name</span><span class="p">,</span> <span class="n">vecs</span><span class="p">,</span> <span class="n">start_id</span><span class="p">))</span>
|
|
<span class="n">start_id</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">vecs</span><span class="p">)</span>
|
|
<span class="n">wait</span><span class="p">(</span><span class="n">tasks</span><span class="p">)</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">" Complete the writing within {time.time() - start} seconds"</span><span class="p">)</span>
|
|
|
|
|
|
<span class="k">def</span> <span class="nf">write_bulk</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">vecs</span><span class="p">,</span> <span class="n">start_id</span><span class="p">):</span>
|
|
<span class="n">actions</span> <span class="o">=</span> <span class="p">[</span>
|
|
<span class="p">{</span>
|
|
<span class="s2">"_index"</span><span class="p">:</span> <span class="n">index_name</span><span class="p">,</span>
|
|
<span class="s2">"my_vector"</span><span class="p">:</span> <span class="n">vecs</span><span class="p">[</span><span class="n">j</span><span class="p">]</span><span class="o">.</span><span class="n">tolist</span><span class="p">(),</span>
|
|
<span class="s2">"_id"</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="n">j</span> <span class="o">+</span> <span class="n">start_id</span><span class="p">)</span>
|
|
<span class="p">}</span>
|
|
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">vecs</span><span class="p">))</span>
|
|
<span class="p">]</span>
|
|
<span class="n">helpers</span><span class="o">.</span><span class="n">bulk</span><span class="p">(</span><span class="n">es</span><span class="p">,</span> <span class="n">actions</span><span class="p">,</span> <span class="n">request_timeout</span><span class="o">=</span><span class="mi">3600</span><span class="p">)</span>
|
|
|
|
|
|
<span class="c1"># Query an index.</span>
|
|
<span class="k">def</span> <span class="nf">search_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">query_file</span><span class="p">,</span> <span class="n">gt_file</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="s2">"Start query! Index name: "</span> <span class="o">+</span> <span class="n">index_name</span><span class="p">)</span>
|
|
|
|
<span class="n">queries</span> <span class="o">=</span> <span class="n">load_vectors</span><span class="p">(</span><span class="n">query_file</span><span class="p">)</span>
|
|
<span class="n">gt</span> <span class="o">=</span> <span class="n">load_gts</span><span class="p">(</span><span class="n">gt_file</span><span class="p">)</span>
|
|
|
|
<span class="n">took</span> <span class="o">=</span> <span class="mi">0</span>
|
|
<span class="n">precision</span> <span class="o">=</span> <span class="p">[]</span>
|
|
<span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">query</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">queries</span><span class="p">):</span>
|
|
<span class="n">hits</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
|
|
<span class="n">query_json</span> <span class="o">=</span> <span class="p">{</span>
|
|
<span class="s2">"size"</span><span class="p">:</span> <span class="n">k</span><span class="p">,</span>
|
|
<span class="s2">"_source"</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
|
|
<span class="s2">"query"</span><span class="p">:</span> <span class="p">{</span>
|
|
<span class="s2">"vector"</span><span class="p">:</span> <span class="p">{</span>
|
|
<span class="s2">"my_vector"</span><span class="p">:</span> <span class="p">{</span>
|
|
<span class="s2">"vector"</span><span class="p">:</span> <span class="n">query</span><span class="o">.</span><span class="n">tolist</span><span class="p">(),</span>
|
|
<span class="s2">"topk"</span><span class="p">:</span> <span class="n">k</span>
|
|
<span class="p">}</span>
|
|
<span class="p">}</span>
|
|
<span class="p">}</span>
|
|
<span class="p">}</span>
|
|
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">,</span> <span class="n">body</span><span class="o">=</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">query_json</span><span class="p">))</span>
|
|
|
|
<span class="k">for</span> <span class="n">hit</span> <span class="ow">in</span> <span class="n">res</span><span class="p">[</span><span class="s1">'hits'</span><span class="p">][</span><span class="s1">'hits'</span><span class="p">]:</span>
|
|
<span class="n">hits</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">hit</span><span class="p">[</span><span class="s1">'_id'</span><span class="p">]))</span>
|
|
<span class="n">precision</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">hits</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">gt</span><span class="p">[</span><span class="n">idx</span><span class="p">,</span> <span class="p">:</span><span class="n">k</span><span class="p">])))</span> <span class="o">/</span> <span class="n">k</span><span class="p">)</span>
|
|
<span class="n">took</span> <span class="o">+=</span> <span class="n">res</span><span class="p">[</span><span class="s1">'took'</span><span class="p">]</span>
|
|
|
|
<span class="nb">print</span><span class="p">(</span><span class="s2">"precision: "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="n">precision</span><span class="p">)</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">precision</span><span class="p">)))</span>
|
|
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">" Complete the retrieval within {took / 1000:.2f} seconds; average took size is {took / len(queries):.2f} ms"</span><span class="p">)</span>
|
|
|
|
|
|
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span>
|
|
<span class="n">vec_file</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">"./data/sift/sift_base.fvecs"</span>
|
|
<span class="n">qry_file</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">"./data/sift/sift_query.fvecs"</span>
|
|
<span class="n">gt_file</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">"./data/sift/sift_groundtruth.ivecs"</span>
|
|
|
|
<span class="n">index</span> <span class="o">=</span> <span class="s2">"test"</span>
|
|
<span class="n">create_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">index_mapping</span><span class="p">)</span>
|
|
<span class="n">write_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">vec_file</span><span class="p">)</span>
|
|
<span class="n">merge_index</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
|
|
<span class="n">refresh_index</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
|
|
|
|
<span class="n">search_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">qry_file</span><span class="p">,</span> <span class="n">gt_file</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
|
|
</pre></div>
|
|
</td></tr></table></div>
|
|
</div>
|
|
</div>
|
|
<div>
|
|
<div class="familylinks">
|
|
<div class="parentlink"><strong>Parent topic:</strong> <a href="css_01_0117.html">Vector Retrieval</a></div>
|
|
</div>
|
|
</div>
|
|
|