把一篇pdf论文解析后,放入es数据库中,建立倒排索引表,并实现简单搜索。
1、pdf论文解析(英文)
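安装pdf解析包(pdfminer.six):pip install pdfminer.six。下面的代码需要先导入:from pdfminer.high_level import extract_pages 和 from pdfminer.layout import LTTextContainer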
def extract_text_from_pdf(filename, page_numbers=None, min_line_length=1):
    """Extract text from a PDF file and regroup it into paragraphs.

    Text is collected from every ``LTTextContainer`` layout element on the
    selected pages and split into lines. Consecutive lines are merged into
    one paragraph; a line shorter than ``min_line_length`` (an empty line
    with the default of 1) ends the current paragraph.

    :param filename: PDF file handed to ``extract_pages``.
    :param page_numbers: optional collection of zero-based page indices to
        keep; ``None`` keeps every page.
    :param min_line_length: minimum number of characters a line needs in
        order to count as paragraph text.
    :return: list of paragraph strings.
    """
    # Collect the raw text of the selected pages; joining once at the end
    # avoids repeated string concatenation.
    pieces = []
    for i, page_layout in enumerate(extract_pages(filename)):
        if page_numbers is not None and i not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                pieces.append(element.get_text() + '\n')
    full_text = ''.join(pieces)

    # Re-assemble the lines into paragraphs.
    paragraphs = []
    buffer = ''
    # True when the previous line ended with a hyphen, i.e. a word was
    # split across the line break and the next line continues that word.
    glue_next = False
    for text in full_text.split('\n'):
        if len(text) < min_line_length:
            # Separator line: close the paragraph collected so far.
            if buffer:
                paragraphs.append(buffer)
            buffer = ''
            glue_next = False
            continue
        # Lines are joined with a single space, except directly after a
        # hyphenated line break, where the two word halves are glued.
        if buffer and not glue_next:
            buffer += ' '
        if text.endswith('-'):
            # Drop only the trailing hyphen(s); leading ones are content.
            buffer += text.rstrip('-')
            glue_next = True
        else:
            buffer += text
            glue_next = False
    if buffer:
        paragraphs.append(buffer)
    return paragraphs
2、关键字提取
nltk安装见nltk安装与使用
- import re
- from nltk.stem import PorterStemmer
- from nltk.tokenize import word_tokenize
- from nltk.corpus import stopwords
def keywords(text):
    """Build a simplified keyword string for a piece of English text.

    :param text: input string
    :return: the stemmed tokens that are not English stop words, joined
        by single spaces
    """
    # Replace every character that is not an ASCII letter, a digit or
    # whitespace with a space.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Tokenize.
    tokens = word_tokenize(cleaned)
    # Stop-word filtering is case-insensitive; stemming is applied to the
    # surviving tokens.
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(token)
        for token in tokens
        if token.lower() not in english_stop_words
    )
3、创建Elasticsearch连接,注意添加证书
Elasticsearch安装见elasticsearch安装与使用(1)-使用docker安装Elasticsearch
- from elasticsearch import Elasticsearch, helpers
import os

# Create the Elasticsearch connection.
# Every setting can be overridden through an environment variable so that
# real credentials do not have to live in the source; the fallbacks are the
# values of the local demo setup and should not be reused elsewhere.
es = Elasticsearch(
    hosts=[os.environ.get("ES_HOST", "https://localhost:9200")],  # service address and port
    basic_auth=(
        os.environ.get("ES_USERNAME", "elastic"),  # user name
        os.environ.get("ES_PASSWORD", "N-sf6R*O0Ur344otTfzc"),  # password
    ),
    ca_certs=os.environ.get(
        "ES_CA_CERTS", "/Users/sunwenjun/data/elastic8/http_ca.crt"
    ),  # CA certificate
)
4、建立倒排索引库
def add_data_to_es(index_name="test_index", text_list=None):
    """(Re)create an index and bulk-load texts into it.

    An existing index with the same name is deleted first, so every call
    starts from an empty index. Each text is stored as one document with
    two fields: ``keywords`` (the output of ``keywords(text)``) and
    ``text`` (the original string).

    :param index_name: name of the index to create.
    :param text_list: texts to index; defaults to ``["text1", "text2"]``
        when omitted or ``None``.
    :return: the result of ``helpers.bulk``.
    """
    # Avoid a mutable default argument; fall back to the sample data here.
    if text_list is None:
        text_list = ["text1", "text2"]

    # Drop the index if it already exists, then create it afresh.
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
    es.indices.create(index=index_name)

    # One bulk action per text.
    actions = [
        {
            "_index": index_name,
            "_source": {
                "keywords": keywords(text),
                "text": text,
            },
        }
        for text in text_list
    ]
    # Load all documents in one bulk call.
    return helpers.bulk(es, actions)
# Load the paragraphs into Elasticsearch.
index_name = "index_test"
add_data_to_es(index_name=index_name, text_list=paragraphs)
索引库可视化见elasticsearch安装与使用(3)-索引库可视化
5、实现搜索
def search(index_name, query, top_n=3):
    """Run a ``match`` query against the ``keywords`` field of an index.

    The query string is passed through ``keywords`` before it is sent.

    :param index_name: index to search.
    :param query: query string.
    :param top_n: value passed as ``size`` to the search request.
    :return: list with the ``text`` field of every returned hit.
    """
    response = es.search(
        index=index_name,
        query={"match": {"keywords": keywords(query)}},
        size=top_n,
    )
    texts = []
    for hit in response["hits"]["hits"]:
        texts.append(hit["_source"]["text"])
    return texts
# Example query: request up to five hits and print each returned text.
query = "retrieval "
results = search(index_name, query, top_n=5)
for res in results:
    print(res)
6、完整代码
esdemo-01
参考
无需重新学习,使用 Kibana 查询/可视化 SLS 数据
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作!更多信息从访问主页:qidao123.com:ToB企服之家,中国第一个企服评测及商务社交产业平台。 |