doc-exports/docs/css/umn/css_01_0129.html
Wuwan, Qi 050b395397 CSS UMN 23.2.1 20230926
Reviewed-by: Kacur, Michal <michal.kacur@t-systems.com>
Co-authored-by: Wuwan, Qi <wuwanqi1@noreply.gitea.eco.tsi-dev.otc-service.com>
Co-committed-by: Wuwan, Qi <wuwanqi1@noreply.gitea.eco.tsi-dev.otc-service.com>
2024-01-10 14:23:15 +00:00

329 lines
21 KiB
HTML

<a name="css_01_0129"></a>
<h1 class="topictitle1">Sample Code for Vector Search on a Client</h1>
<div id="body0000001261749932"><p id="css_01_0129__en-us_topic_0000001261749932_p143381112172510">Elasticsearch provides standard REST APIs and clients developed using Java, Python, and Go.</p>
<p id="css_01_0129__en-us_topic_0000001261749932_p1675193372415">Based on the open-source dataset <strong id="css_01_0129__en-us_topic_0000001261749932_b215561919574">SIFT1M</strong> (http://corpus-texmex.irisa.fr/) and the Python Elasticsearch client, this section provides sample code for creating a vector index, importing vector data, and querying vector data on the client.</p>
<div class="section" id="css_01_0129__en-us_topic_0000001261749932_section1864223419264"><h4 class="sectiontitle">Prerequisites</h4><p id="css_01_0129__en-us_topic_0000001261749932_p1299195810328">The required Python packages (NumPy and the Elasticsearch client) have been installed on the client. If they are not installed, run the following commands to install them:</p>
<pre class="screen" id="css_01_0129__en-us_topic_0000001261749932_screen1932333123212">pip install numpy
pip install elasticsearch==7.6.0</pre>
</div>
<div class="section" id="css_01_0129__en-us_topic_0000001261749932_section10885342132619"><h4 class="sectiontitle">Sample Code</h4><div class="codecoloring" codetype="Python" id="css_01_0129__en-us_topic_0000001261749932_screen8396163917303"><table class="highlighttable"><tr><td class="linenos"><div class="linenodiv"><pre> 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155</pre></div></td><td class="code"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">time</span>
<span class="kn">import</span> <span class="nn">json</span>
<span class="kn">from</span> <span class="nn">concurrent.futures</span> <span class="kn">import</span> <span class="n">ThreadPoolExecutor</span><span class="p">,</span> <span class="n">wait</span>
<span class="kn">from</span> <span class="nn">elasticsearch</span> <span class="kn">import</span> <span class="n">Elasticsearch</span>
<span class="kn">from</span> <span class="nn">elasticsearch</span> <span class="kn">import</span> <span class="n">helpers</span>
<span class="n">endpoint</span> <span class="o">=</span> <span class="s1">'http://xxx.xxx.xxx.xxx:9200/'</span>
<span class="c1"># Construct an Elasticsearch client object</span>
<span class="n">es</span> <span class="o">=</span> <span class="n">Elasticsearch</span><span class="p">(</span><span class="n">endpoint</span><span class="p">)</span>
<span class="c1"># Index mapping information</span>
<span class="n">index_mapping</span> <span class="o">=</span> <span class="s1">'''</span>
<span class="s1">{</span>
<span class="s1"> &quot;settings&quot;: {</span>
<span class="s1"> &quot;index&quot;: {</span>
<span class="s1"> &quot;vector&quot;: &quot;true&quot;</span>
<span class="s1"> }</span>
<span class="s1"> },</span>
<span class="s1"> &quot;mappings&quot;: {</span>
<span class="s1"> &quot;properties&quot;: {</span>
<span class="s1"> &quot;my_vector&quot;: {</span>
<span class="s1"> &quot;type&quot;: &quot;vector&quot;,</span>
<span class="s1"> &quot;dimension&quot;: 128,</span>
<span class="s1"> &quot;indexing&quot;: true,</span>
<span class="s1"> &quot;algorithm&quot;: &quot;GRAPH&quot;,</span>
<span class="s1"> &quot;metric&quot;: &quot;euclidean&quot;</span>
<span class="s1"> }</span>
<span class="s1"> }</span>
<span class="s1"> }</span>
<span class="s1">}</span>
<span class="s1">'''</span>
<span class="c1"># Create an index.</span>
<span class="k">def</span> <span class="nf">create_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">mapping</span><span class="p">):</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">,</span> <span class="n">ignore</span><span class="o">=</span><span class="mi">400</span><span class="p">,</span> <span class="n">body</span><span class="o">=</span><span class="n">mapping</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="c1"># Delete an index.</span>
<span class="k">def</span> <span class="nf">delete_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">):</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">delete</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="c1"># Refresh indexes.</span>
<span class="k">def</span> <span class="nf">refresh_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">):</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">refresh</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="n">res</span><span class="p">)</span>
<span class="c1"># Merge index segments.</span>
<span class="k">def</span> <span class="nf">merge_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">seg_cnt</span><span class="o">=</span><span class="mi">1</span><span class="p">):</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">es</span><span class="o">.</span><span class="n">indices</span><span class="o">.</span><span class="n">forcemerge</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">,</span> <span class="n">max_num_segments</span><span class="o">=</span><span class="n">seg_cnt</span><span class="p">,</span> <span class="n">request_timeout</span><span class="o">=</span><span class="mi">36000</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; Complete the merge within {time.time() - start} seconds&quot;</span><span class="p">)</span>
<span class="c1"># Load vector data.</span>
<span class="k">def</span> <span class="nf">load_vectors</span><span class="p">(</span><span class="n">file_name</span><span class="p">):</span>
<span class="n">fv</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">fromfile</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="n">dim</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">vectors</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">dim</span><span class="p">)[:,</span> <span class="mi">1</span><span class="p">:]</span>
<span class="k">return</span> <span class="n">vectors</span>
<span class="c1"># Load the ground_truth data.</span>
<span class="k">def</span> <span class="nf">load_gts</span><span class="p">(</span><span class="n">file_name</span><span class="p">):</span>
<span class="n">fv</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">fromfile</span><span class="p">(</span><span class="n">file_name</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">)</span>
<span class="n">dim</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">int32</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">gts</span> <span class="o">=</span> <span class="n">fv</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">dim</span><span class="p">)[:,</span> <span class="mi">1</span><span class="p">:]</span>
<span class="k">return</span> <span class="n">gts</span>
<span class="k">def</span> <span class="nf">partition</span><span class="p">(</span><span class="n">ls</span><span class="p">,</span> <span class="n">size</span><span class="p">):</span>
<span class="k">return</span> <span class="p">[</span><span class="n">ls</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="n">size</span><span class="p">]</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">ls</span><span class="p">),</span> <span class="n">size</span><span class="p">)]</span>
<span class="c1"># Write vector data.</span>
<span class="k">def</span> <span class="nf">write_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">vec_file</span><span class="p">):</span>
<span class="n">pool</span> <span class="o">=</span> <span class="n">ThreadPoolExecutor</span><span class="p">(</span><span class="n">max_workers</span><span class="o">=</span><span class="mi">8</span><span class="p">)</span>
<span class="n">tasks</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">vectors</span> <span class="o">=</span> <span class="n">load_vectors</span><span class="p">(</span><span class="n">vec_file</span><span class="p">)</span>
<span class="n">bulk_size</span> <span class="o">=</span> <span class="mi">1000</span>
<span class="n">partitions</span> <span class="o">=</span> <span class="n">partition</span><span class="p">(</span><span class="n">vectors</span><span class="p">,</span> <span class="n">bulk_size</span><span class="p">)</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">start_id</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">vecs</span> <span class="ow">in</span> <span class="n">partitions</span><span class="p">:</span>
<span class="n">tasks</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">pool</span><span class="o">.</span><span class="n">submit</span><span class="p">(</span><span class="n">write_bulk</span><span class="p">,</span> <span class="n">index_name</span><span class="p">,</span> <span class="n">vecs</span><span class="p">,</span> <span class="n">start_id</span><span class="p">))</span>
<span class="n">start_id</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">vecs</span><span class="p">)</span>
<span class="n">wait</span><span class="p">(</span><span class="n">tasks</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; Complete the writing within {time.time() - start} seconds&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">write_bulk</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">vecs</span><span class="p">,</span> <span class="n">start_id</span><span class="p">):</span>
<span class="n">actions</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">{</span>
<span class="s2">&quot;_index&quot;</span><span class="p">:</span> <span class="n">index_name</span><span class="p">,</span>
<span class="s2">&quot;my_vector&quot;</span><span class="p">:</span> <span class="n">vecs</span><span class="p">[</span><span class="n">j</span><span class="p">]</span><span class="o">.</span><span class="n">tolist</span><span class="p">(),</span>
<span class="s2">&quot;_id&quot;</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="n">j</span> <span class="o">+</span> <span class="n">start_id</span><span class="p">)</span>
<span class="p">}</span>
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">vecs</span><span class="p">))</span>
<span class="p">]</span>
<span class="n">helpers</span><span class="o">.</span><span class="n">bulk</span><span class="p">(</span><span class="n">es</span><span class="p">,</span> <span class="n">actions</span><span class="p">,</span> <span class="n">request_timeout</span><span class="o">=</span><span class="mi">3600</span><span class="p">)</span>
<span class="c1"># Query an index.</span>
<span class="k">def</span> <span class="nf">search_index</span><span class="p">(</span><span class="n">index_name</span><span class="p">,</span> <span class="n">query_file</span><span class="p">,</span> <span class="n">gt_file</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Start query! Index name: &quot;</span> <span class="o">+</span> <span class="n">index_name</span><span class="p">)</span>
<span class="n">queries</span> <span class="o">=</span> <span class="n">load_vectors</span><span class="p">(</span><span class="n">query_file</span><span class="p">)</span>
<span class="n">gt</span> <span class="o">=</span> <span class="n">load_gts</span><span class="p">(</span><span class="n">gt_file</span><span class="p">)</span>
<span class="n">took</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">precision</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">idx</span><span class="p">,</span> <span class="n">query</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">queries</span><span class="p">):</span>
<span class="n">hits</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="n">query_json</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;size&quot;</span><span class="p">:</span> <span class="n">k</span><span class="p">,</span>
<span class="s2">&quot;_source&quot;</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
<span class="s2">&quot;query&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">&quot;vector&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">&quot;my_vector&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">&quot;vector&quot;</span><span class="p">:</span> <span class="n">query</span><span class="o">.</span><span class="n">tolist</span><span class="p">(),</span>
<span class="s2">&quot;topk&quot;</span><span class="p">:</span> <span class="n">k</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="n">res</span> <span class="o">=</span> <span class="n">es</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">index_name</span><span class="p">,</span> <span class="n">body</span><span class="o">=</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">query_json</span><span class="p">))</span>
<span class="k">for</span> <span class="n">hit</span> <span class="ow">in</span> <span class="n">res</span><span class="p">[</span><span class="s1">'hits'</span><span class="p">][</span><span class="s1">'hits'</span><span class="p">]:</span>
<span class="n">hits</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">hit</span><span class="p">[</span><span class="s1">'_id'</span><span class="p">]))</span>
<span class="n">precision</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">hits</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">gt</span><span class="p">[</span><span class="n">idx</span><span class="p">,</span> <span class="p">:</span><span class="n">k</span><span class="p">])))</span> <span class="o">/</span> <span class="n">k</span><span class="p">)</span>
<span class="n">took</span> <span class="o">+=</span> <span class="n">res</span><span class="p">[</span><span class="s1">'took'</span><span class="p">]</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;precision: &quot;</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="nb">sum</span><span class="p">(</span><span class="n">precision</span><span class="p">)</span> <span class="o">/</span> <span class="nb">len</span><span class="p">(</span><span class="n">precision</span><span class="p">)))</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; Complete the retrieval within {took / 1000:.2f} seconds; average took size is {took / len(queries):.2f} ms&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">vec_file</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">&quot;./data/sift/sift_base.fvecs&quot;</span>
<span class="n">qry_file</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">&quot;./data/sift/sift_query.fvecs&quot;</span>
<span class="n">gt_file</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">&quot;./data/sift/sift_groundtruth.ivecs&quot;</span>
<span class="n">index</span> <span class="o">=</span> <span class="s2">&quot;test&quot;</span>
<span class="n">create_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">index_mapping</span><span class="p">)</span>
<span class="n">write_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">vec_file</span><span class="p">)</span>
<span class="n">merge_index</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
<span class="n">refresh_index</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
<span class="n">search_index</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="n">qry_file</span><span class="p">,</span> <span class="n">gt_file</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
</pre></div>
</td></tr></table></div>
</div>
</div>
<div>
<div class="familylinks">
<div class="parentlink"><strong>Parent topic:</strong> <a href="css_01_0117.html">Vector Retrieval</a></div>
</div>
</div>