DB2 XML full-text search using NSE

/*
　　1)Download/Install Net Search Extender and set variable
　　2)Create table and load xml data
　　3)db2text start/Enable for text
　　4)create index for text
　　5)update index
　　6)Search Text with function CONTAINS()
　　7)db2text stop
*/

-- DB2 是透過 Net Search Extender 對 XML 做 full-text search
-- 先到 IBM 官網去 download Net Search Extender，我的版本是9.5
-- 安裝好之後，先去設定環境變數 DB2DBDFT

可直接在 CLP 使用 SET 指令設定（例如 SET DB2DBDFT=SAMPLE），或是到
「控制台」→「系統」→「進階」→「環境變數」新增系統變數 DB2DBDFT，並將其值設定為要使用的 Database 名稱即可，ex: SAMPLE

※ 不設定的話，會一直出現 CTE0139 環境變數 "DB2DBDFT" 尚未設定。
※ 設定好之後，可以不必下connect database，由 db2text 內部直接做掉。

-- 建立一個測試的 table:ARCHAEOLOGY 來存放考古新聞


-- Test table that stores one archaeology news article per row.
-- Only ASCII whitespace is used so the statement can be pasted into the CLP as-is.
CREATE TABLE ARCHAEOLOGY
(
    SNO       INTEGER NOT NULL PRIMARY KEY,  -- serial number of the article
    NEWSDATE  CHAR(8),                       -- 8-character date string, e.g. 20070309 in the load file
    NEWS      XML                            -- the article itself, stored as an XML document
) IN USERSPACE1;

-- XML 的 layout:


<News>
  <Headline>新聞標題</Headline>
  <Reporter>記者</Reporter>
  <Venue>事件發生地點</Venue>
  <NewAgency>新聞社</NewAgency>
  <PublishDate>新聞發布時間</PublishDate>
  <Content>新聞內容</Content>
</News>

-- 抓張圖看一下範例:

-- 直接將我自己準備的 16 則新聞以 xml 的格式儲存
-- 並做一個archaeology.del 檔，內容如下:


1,20070309,<XDS FIL='20070309_1.xml'/>
2,20070402,<XDS FIL='20070402_1.xml'/>
3,20070711,<XDS FIL='20070711_1.xml'/>
4,20070712,<XDS FIL='20070712_1.xml'/>
5,20070713,<XDS FIL='20070713_1.xml'/>
6,20070716,<XDS FIL='20070716_1.xml'/>
7,20070719,<XDS FIL='20070719.xml'/>
8,20070724,<XDS FIL='20070724_2.xml'/>
9,20070725,<XDS FIL='20070725_2.xml'/>
10,20070727,<XDS FIL='20070727_1.xml'/>
11,20070801,<XDS FIL='20070801_1.xml'/>
12,20070806,<XDS FIL='20070806_1.xml'/>
13,20070813,<XDS FIL='20070813_1.xml'/>
14,20070814,<XDS FIL='20070814_1.xml'/>
15,20070821,<XDS FIL='20070821_2.xml'/>
16,20080903,<XDS FIL='20080903_1.xml'/>

-- import 進去


-- Load archaeology.del into the table; the XML documents it references are read from c:/
IMPORT FROM archaeology.del OF DEL XML FROM c:/ INSERT INTO archaeology

-- 用 db2text 將 Net Search Extender 叫起來


-- Start the Net Search Extender services.
db2text START

-- 讓 NSE 對 Database SAMPLE 做 text search


-- Enable the SAMPLE database for text search.
db2text ENABLE DATABASE FOR TEXT CONNECT TO sample

-- 用 db2text 對 table archaeology 建 index


-- Create the text index archaeology_idx on the NEWS column of ARCHAEOLOGY.
db2text CREATE INDEX archaeology_idx FOR TEXT ON archaeology(news)

-- 對 index 做 sync


-- Synchronize the text index with the current table contents.
db2text UPDATE INDEX archaeology_idx FOR TEXT

-- 使用 CONTAINS() 來找資料
-- 1) 用 like 的方式從整個 xml 裡每一個 element 找包含 Alexan 字眼的文章並將新聞標題列出來


-- Headlines of every article whose XML contains a word starting with "Alexan".
SELECT
    XMLQUERY('$ns//Headline' PASSING news AS "ns")
FROM archaeology
WHERE CONTAINS(news, '"Alexan%"') = 1;

--　結果找到亞歷山大大帝與有關亞歷山卓城的新聞


<Headline>Alexander the Great Conquered City via Sunken Sandbar</Headline>
<Headline>Hidden City Found Beneath Alexandria</Headline>
<Headline>Ancient "Lost" City's Remains Found Under Alexandria's Waters</Headline>

　　　已選取 3 個記錄。

-- 2) 從整個 xml 裡每一個 element 找包含 Tyre 字眼的文章並將新聞標題列出來


-- Headlines of every article whose XML contains the word "Tyre".
SELECT
    XMLQUERY('$ns//Headline' PASSING news AS "ns")
FROM archaeology
WHERE CONTAINS(news, '"Tyre"') = 1;

--　結果在新聞內容中找到亞歷山大大帝征服推羅古城的新聞


<Headline>Alexander the Great Conquered City via Sunken Sandbar</Headline>

　　　已選取 1 個記錄。

--　找包含 the 這個字的有多少新聞


-- Number of articles the text index reports as containing the word "the".
SELECT
    COUNT(*)
FROM archaeology
WHERE CONTAINS(news, '"the"') = 1;

--　結果是全部筆數


1
-----------
　　　　　16

　　　已選取 1 個記錄。

-- 再轉一筆今天的新聞進去
-- a1.del 內容如下


17,20080905,<XDS FIL='20080905.xml'/>

-- import


-- Load the single additional row from a1.del; its XML document is read from c:\
IMPORT FROM a1.del OF DEL XML FROM c:\ INSERT INTO archaeology;

--　看一下 archaeology 總筆數


-- Total number of rows in the table, independent of the text index.
SELECT
    COUNT(*)
FROM archaeology;

1
-----------
　　　　　17

　　　已選取 1 個記錄。

--　再找找看包含 the 這個字的有多少新聞
--(20080905.xml確定包含 "the" 這個字)


-- Same "the" search, run before the text index has been updated for the new row.
SELECT
    COUNT(*)
FROM archaeology
WHERE CONTAINS(news, '"the"') = 1;

--　結果還是 16 筆


1
-----------
　　　　　16

　　　已選取 1 個記錄。

--　原因是 index 要做 sync


-- Synchronize the text index so the newly imported row becomes searchable.
db2text UPDATE INDEX archaeology_idx FOR TEXT

--　再找找看包含 the 這個字的有多少新聞


-- Same "the" search, run again after the text index update.
SELECT
    COUNT(*)
FROM archaeology
WHERE CONTAINS(news, '"the"') = 1;

--　結果就是 17 筆了


1
-----------
　　　　　17

　　　已選取 1 個記錄。

--　將 db2text 服務關閉


-- Shut down the db2text service.
db2text STOP
CTE0001 作業順利完成。

Orion's DB Lab Diary - since 2008

2008年9月5日星期五