标签 Elasticsearch 下的文章

Elasticsearch Search相关

作者: herefree
时间: 2020-03-25
分类: 大数据组件
8079 条评论

Search API

指定查询的索引

2020-03-25 15-50-32.png

一、URI Search（用的不多）

在URI中使用查询参数

2020-03-25 16-01-52.png

2020-03-25 16-09-16.png

2020-03-25 16-10-31.png

2020-03-25 16-12-11.png

2020-03-25 16-13-30.png

二、Request Body Search

官方文档

使用Elasticsearch提供的，基于JSON格式的更加完备的Query Domain Specific Language（DSL）

2020-03-25 15-58-06.png

#ignore_unavailable=true，可以忽略尝试访问不存在的索引“404_idx”导致的报错
POST /movies,404_idx/_search?ignore_unavailable=true
{
  "profile": true,
    "query": {
        "match_all": {}
    }
}
# Paginated match_all search on kibana_sample_data_ecommerce:
# "from" is the offset of the first hit, "size" is the number of hits returned
POST /kibana_sample_data_ecommerce/_search
{
  "from":10,
  "size":20,
  "query":{
    "match_all": {}
  }
}


#对日期排序
POST kibana_sample_data_ecommerce/_search
{
  "sort":[{"order_date":"desc"}],
  "query":{
    "match_all": {}
  }

}

#source filtering
POST kibana_sample_data_ecommerce/_search
{
  "_source":["order_date"],
  "query":{
    "match_all": {}
  }
}


#脚本字段
GET kibana_sample_data_ecommerce/_search
{
  "script_fields": {
    "new_field": {
      "script": {
        "lang": "painless",
        "source": "doc['order_date'].value+'hello'"
      }
    }
  },
  "query": {
    "match_all": {}
  }
}

#match查询
POST movies/_search
{
  "query": {
    "match": {
      "title": "last christmas"
    }
  }
}

# match query with "operator": "and" — both words of the query string must appear in the title
POST movies/_search
{
  "query": {
    "match": {
      "title": {
        "query": "last christmas",
        "operator": "and"
      }
    }
  }
}
#短语查询 Match Phrase
POST movies/_search
{
  "query": {
    "match_phrase": {
      "title":{
        "query": "one love"
      }
    }
  }
}

# match_phrase with "slop": 1 — other words may appear between "one" and "love"
POST movies/_search
{
  "query": {
    "match_phrase": {
      "title":{
        "query": "one love",
        "slop": 1
      }
    }
  }
}

POST /products/_bulk
{ "index": { "_id": 1 }}
{ "price" : 10,"avaliable":true,"date":"2018-01-01", "productID" : "XHDK-A-1293-#fJ3" }
{ "index": { "_id": 2 }}
{ "price" : 20,"avaliable":true,"date":"2019-01-01", "productID" : "KDKE-B-9947-#kL5" }
{ "index": { "_id": 3 }}
{ "price" : 30,"avaliable":true, "productID" : "JODL-X-1937-#pV7" }
{ "index": { "_id": 4 }}
{ "price" : 30,"avaliable":false, "productID" : "QQPX-R-3956-#aD8" }



#基本语法
POST /products/_search
{
  "query": {
    "bool" : {
      "must" : {
        "term" : { "price" : "30" }
      },
      "filter": {
        "term" : { "avaliable" : "true" }
      },
      "must_not" : {
        "range" : {
          "price" : { "lte" : 10 }
        }
      },
      "should" : [
        { "term" : { "productID.keyword" : "JODL-X-1937-#pV7" } },
        { "term" : { "productID.keyword" : "XHDK-A-1293-#fJ3" } }
      ],
      "minimum_should_match" :1
    }
  }
}

#改变数据模型，增加字段。解决数组包含而不是精确匹配的问题
POST /newmovies/_bulk
{ "index": { "_id": 1 }}
{ "title" : "Father of the Bridge Part II","year":1995, "genre":"Comedy","genre_count":1 }
{ "index": { "_id": 2 }}
{ "title" : "Dave","year":1993,"genre":["Comedy","Romance"],"genre_count":2 }

#must，有算分
POST /newmovies/_search
{
  "query": {
    "bool": {
      "must": [
        {"term": {"genre.keyword": {"value": "Comedy"}}},
        {"term": {"genre_count": {"value": 1}}}

      ]
    }
  }
}

#Filter。不参与算分，结果的score是0
POST /newmovies/_search
{
  "query": {
    "bool": {
      "filter": [
        {"term": {"genre.keyword": {"value": "Comedy"}}},
        {"term": {"genre_count": {"value": 1}}}
        ]

    }
  }
}


#Filtering Context
POST _search
{
  "query": {
    "bool" : {

      "filter": {
        "term" : { "avaliable" : "true" }
      },
      "must_not" : {
        "range" : {
          "price" : { "lte" : 10 }
        }
      }
    }
  }
}


#Query Context
POST /products/_bulk
{ "index": { "_id": 1 }}
{ "price" : 10,"avaliable":true,"date":"2018-01-01", "productID" : "XHDK-A-1293-#fJ3" }
{ "index": { "_id": 2 }}
{ "price" : 20,"avaliable":true,"date":"2019-01-01", "productID" : "KDKE-B-9947-#kL5" }
{ "index": { "_id": 3 }}
{ "price" : 30,"avaliable":true, "productID" : "JODL-X-1937-#pV7" }
{ "index": { "_id": 4 }}
{ "price" : 30,"avaliable":false, "productID" : "QQPX-R-3956-#aD8" }


POST /products/_search
{
  "query": {
    "bool": {
      "should": [
        {
          "term": {
            "productID.keyword": {
              "value": "JODL-X-1937-#pV7"}}
        },
        {"term": {"avaliable": {"value": true}}
        }
      ]
    }
  }
}


#嵌套，实现了 should not 逻辑
POST /products/_search
{
  "query": {
    "bool": {
      "must": {
        "term": {
          "price": "30"
        }
      },
      "should": [
        {
          "bool": {
            "must_not": {
              "term": {
                "avaliable": "false"
              }
            }
          }
        }
      ],
      "minimum_should_match": 1
    }
  }
}


# Control the precision: minimum_should_match is 2 here, so both should clauses must match
# The range query is used inside must_not to exclude documents with price <= 10
POST _search
{
  "query": {
    "bool" : {
      "must" : {
        "term" : { "price" : "30" }
      },
      "filter": {
        "term" : { "avaliable" : "true" }
      },
      "must_not" : {
        "range" : {
          "price" : { "lte" : 10 }
        }
      },
      "should" : [
        { "term" : { "productID.keyword" : "JODL-X-1937-#pV7" } },
        { "term" : { "productID.keyword" : "XHDK-A-1293-#fJ3" } }
      ],
      "minimum_should_match" :2
    }
  }
}



POST /animals/_search
{
  "query": {
    "bool": {
      "should": [
        { "term": { "text": "brown" }},
        { "term": { "text": "red" }},
        { "term": { "text": "quick"   }},
        { "term": { "text": "dog"   }}
      ]
    }
  }
}

# Nested variant: "brown" and "red" are moved into an inner bool/should,
# so the two colour terms together form one clause of the outer should
POST /animals/_search
{
  "query": {
    "bool": {
      "should": [
        { "term": { "text": "quick" }},
        { "term": { "text": "dog"   }},
        {
          "bool":{
            "should":[
               { "term": { "text": "brown" }},
               { "term": { "text": "red" }}
            ]
          }

        }
      ]
    }
  }
}


DELETE blogs
POST /blogs/_bulk
{ "index": { "_id": 1 }}
{"title":"Apple iPad", "content":"Apple iPad,Apple iPad" }
{ "index": { "_id": 2 }}
{"title":"Apple iPad,Apple iPad", "content":"Apple iPad" }

# boost
# boost > 1: the clause's relative contribution to the score is increased
# 0 < boost < 1: the clause's relative contribution to the score is decreased
# boost < 0: the clause contributes a negative score
# NOTE(review): Elasticsearch 7.0+ is documented to reject negative boost values — confirm for the target version
POST blogs/_search
{
  "query": {
    "bool": {
      "should": [
        {"match": {
          "title": {
            "query": "apple,ipad",
            "boost": 1.1
          }
        }},

        {"match": {
          "content": {
            "query": "apple,ipad"
          }
        }}
      ]
    }
  }
}

查询结果

2020-03-25 15-59-0.png

参考极客时间Elasticsearch课程

Python调用Elasticsearch相关

作者: herefree
时间: 2019-12-03
分类: 大数据组件
16017 条评论

一、安装

pip install elasticsearch

二、一个小封装类

#索引类
class ElasticSearchClient(object):
    """Build an Elasticsearch client from an INI file and manage indices.

    Only a single node is configured; supporting several nodes would require
    extending ``es_servers``. The client needs no explicit open/close step.
    """

    def __init__(self, filepath="app/conf/conf.ini"):
        """Read the ``[Elasticsearch]`` section of *filepath* and build the client.

        :param filepath: path to an INI file whose ``Elasticsearch`` section
            provides the ``url`` (host) and ``port`` options.
        :raises configparser.Error: if the section or an option is missing.
        :raises ValueError: if ``port`` is not an integer.
        """
        conf = configparser.ConfigParser()
        conf.read(filepath, encoding='utf-8')

        self.es_servers = [{
            "host": conf.get('Elasticsearch', 'url'),
            # getint: hand the client a real integer port instead of the raw
            # config string, and fail fast on a malformed value.
            "port": conf.getint('Elasticsearch', 'port')
        }]
        # http_auth carries the user name and password for a cluster with
        # security enabled; drop the argument when the cluster has none.
        self.es_client = elasticsearch.Elasticsearch(hosts=self.es_servers, http_auth=("xxx", "xxxxx"))

    def create_index(self, index_name):
        """Create the index *index_name* without an explicit body."""
        self.es_client.indices.create(index=index_name)

    def createindex_by_map(self, index_name, map):
        """Create the index *index_name* using *map* as the request body.

        The parameter name shadows the built-in ``map``; it is kept because
        callers pass it by keyword.
        """
        self.es_client.indices.create(index=index_name, body=map)

    def delete_es_index(self, index_name):
        """Delete the index *index_name*."""
        self.es_client.indices.delete(index=index_name)



class LoadElasticSearch(object):
    """Create/read/update/delete helper bound to a single index."""

    def __init__(self, index, doc_type='docx'):
        """Bind to *index* and create it when it does not exist yet.

        :param index: name of the index every method operates on.
        :param doc_type: document type passed to the typed API calls.
        :raises Exception: whatever ``ElasticSearchClient()`` raised, after a
            hint has been printed.
        """
        self.index = index
        self.doc_type = doc_type
        try:
            self.es_client = ElasticSearchClient().es_client
        except Exception as e:
            print(e)
            print('连接es失败，请查看是否连接。')
            # Re-raise: without a client the next statement would only fail
            # with an unrelated AttributeError on self.es_client.
            raise

        if not self.es_client.indices.exists(index=index):
            # The index is missing, so create it.
            self.es_client.indices.create(index=self.index)

    def set_index_mapping(self, set_mappings):
        """
        Set the mapping of the index (comparable to a table schema).

        Only the ``properties`` part of a mapping is handled here; mappings
        have many other parameters. Precondition: the index already exists
        and any custom analyzer is already defined, see
        https://blog.csdn.net/u013905744/article/details/80935846
        NOTE(review): the call is typed (uses doc_type) — confirm that the
        target Elasticsearch version still accepts typed mappings.

        Example argument:
            set_mappings = {"answer": {
                        "type": "string",
                        "index": "not_analyzed"
                    },
                    "answerAuthor": {
                        "type": "string"
                    },
                    "answerDate": {
                        "type": "date",
                        "format": "strict_date_optional_time||epoch_millis"  (a compound format)
                    },
                    ...
                }

        :param set_mappings: dict mapping field names to field definitions.
        """
        mapping = {
            self.doc_type: {
                "properties": set_mappings
            }
        }
        self.es_client.indices.put_mapping(index=self.index, doc_type=self.doc_type, body=mapping)

    def add_date(self, row_obj):
        """
        Index a single document.

        :param row_obj: the document body.
        """
        self.es_client.index(index=self.index, doc_type=self.doc_type, body=row_obj)

    def add_date_bulk(self, row_obj_list):
        """
        Index many documents through the bulk helper, 2000 per request.

        :param row_obj_list: iterable of document bodies, each in the same
            form as the argument of ``add_date``.
        """
        bulk_num = 2000  # documents per bulk request
        load_data = []
        batch_no = 0
        for row_obj in row_obj_list:
            load_data.append({
                "_index": self.index,
                "_type": self.doc_type,
                "_source": row_obj
            })
            # Flush as soon as a full batch has been collected.
            if len(load_data) == bulk_num:
                batch_no += 1
                print('插入', batch_no, '批数据')
                print(len(load_data))
                success, failed = bulk(self.es_client, load_data, index=self.index, raise_on_error=True)
                load_data.clear()
                print(success, failed)

        # Flush the final, partially filled batch.
        if load_data:
            success, failed = bulk(self.es_client, load_data, index=self.index, raise_on_error=True)
            load_data.clear()
            print(success, failed)

    def update_by_id(self, row_obj):
        """
        Partially update the document whose id is ``row_obj["_id"]``.

        The remaining keys of *row_obj* are sent as the partial document.
        *row_obj* itself is left unmodified.

        :param row_obj: dict holding ``_id`` plus the fields to update.
        :raises KeyError: if *row_obj* has no ``_id`` key.
        """
        if "_id" not in row_obj:
            raise KeyError("_id")
        # Work on a copy so the caller's dict keeps its "_id" entry.
        doc = dict(row_obj)
        _id = doc.pop("_id")
        self.es_client.update(index=self.index, doc_type=self.doc_type, body={"doc": doc}, id=_id)

    def delete_by_id(self, _id):
        """
        Delete the document with the given id.

        :param _id: id of the document to delete.
        """
        self.es_client.delete(index=self.index, doc_type=self.doc_type, id=_id)

    def search_by_query(self, body):
        """
        Run a search on the index with *body* as the request body.

        :param body: the search request body (query DSL).
        :return: the response returned by the client's ``search`` call.
        """
        return self.es_client.search(index=self.index, doc_type=self.doc_type, body=body)

三、如何使用

1.创建索引时指定Mapping

我们在创建索引时，需要给创建的索引指定Mapping，我将Mapping文件放入了一个xxx.json文件中（注意：下面示例中以 # 开头的行只是说明文字，标准JSON不支持注释，实际保存为.json文件时需要删除这些行，否则json.load会解析失败）

{
  "settings": {
  #设置副本数
   "number_of_replicas": 1,
     #设置分片
   "number_of_shards": 4,
      #设置分析器 我们采用ik作为tokenizer pinyin作为filter
   "analysis": {
     "analyzer": {
       "my_analyzer":{
       "type":"custom",
       "tokenizer":"ik_max_word",
       "filter":["pinyin_first_letter_and_full_pinyin_filter"]
     }
     },
     "filter": {
       "pinyin_first_letter_and_full_pinyin_filter": {
                    "type" : "pinyin",
                    "keep_first_letter" : "true",
                    "keep_full_pinyin" : "false",
                    "keep_none_chinese" : "true",
                    "keep_original" : "false",
                    "limit_first_letter_length" : 16,
                    "lowercase" : "true",
                    "trim_whitespace" : "true",
                    "keep_none_chinese_in_first_letter" : "true"
                }
     }

   }
 },
 "mappings": {
   "dynamic_templates": [
     {
       "strings":{
           #设定读取到索引中是String类型就设置type为text字段采用我自己设置的分析器，并增加 keyword字段
         "match_mapping_type":"string", 
         "mapping":{
           "type":"text",
           "analyzer":"my_analyzer",
           "fields":{
             "raw":{
               "type":"keyword"
             }

           }
         }
       }
     }
     ]
 }
}

创建代码

mappath="xxxx/xxxx.json"
# Load the index definition (settings + mappings) from the JSON file;
# the with-block closes the file handle even if parsing fails.
with open(mappath,'r',encoding='utf-8') as f:
    index_body=json.load(f)
es=ElasticSearchClient()
# Create the index from the loaded definition (indexname is a placeholder
# for the name of the index to create).
es.createindex_by_map(indexname,map=index_body)

2.查询

es_client = LoadElasticSearch(indexname)
# xxxx is a placeholder for the actual query clause
search={"query":xxxx}
# Pass the request body built above
res = es_client.search_by_query(search)

Elasticsearch的Index Template和Dynamic Template

作者: herefree
时间: 2019-12-03
分类: 大数据组件,Elasticsearch
593 条评论

一、什么是Index Templates

帮助设定Mappings和Setting，并按照一定的规则，自动匹配到新创建的索引之上

模板仅在一个索引被新创建时，才会产生作用。修改模板不会影响已创建的索引
你可以设定多个索引模板，这些设置会被“merge”在一起
你可以指定“order”的数值，控制“merging”的过程

文档

二、Index Template的工作方式

当一个索引被创建时

应用Elasticsearch默认的setting和mapping
应用order数值低的Index Template中的设定
应用order高的Index Template中的设定，之前的设定会被覆盖
应用创建索引时，用户所指定的Setting和Mapping，并覆盖之前模板中的设定

PUT /_template/template_test
{
    "index_patterns" : ["test*"],
    "order" : 1,
    "settings" : {
        "number_of_shards": 1,
        "number_of_replicas" : 2
    },
    "mappings" : {
        "date_detection": false,
        "numeric_detection": true
    }
}

三、什么是Dynamic Template

根据Elasticsearch识别的数据类型，结合字段名称，来动态设定字段类型

所有的字符串类型都设定成Keyword，或者关闭Keyword字段
is开头的字段都设置成boolean
long_开头的都设置成long类型

文档

四、Dynamic Template设定

Dynamic Template是定义在某个索引的Mapping中
Template有一个名称
匹配规则是一个数组
为匹配到字段设置Mapping

PUT my_index
{
  "mappings": {
    "dynamic_templates": [
            {
        "strings_as_boolean": {
          "match_mapping_type":   "string",
          "match":"is*",
          "mapping": {
            "type": "boolean"
          }
        }
      },
      {
        "strings_as_keywords": {
          "match_mapping_type":   "string",
          "mapping": {
            "type": "keyword"
          }
        }
      }
    ]
  }
}

Elasticsearch索引字段类型简介

作者: herefree
时间: 2019-11-27
分类: 大数据组件,Elasticsearch
评论

更多设置官网

字段类型设置

Index - 控制当前字段是否被索引。默认为true。如果设置成false，该字段不可被搜索。

DELETE users
PUT users
{
    "mappings" : {
      "properties" : {
        "firstName" : {
          "type" : "text"
        },
        "lastName" : {
          "type" : "text"
        },
        "mobile" : {
          "type" : "text",
          "index": false
        }
      }
    }
}

null_value

需要对Null值实现搜索
只有Keyword类型支持设定Null_Value

# Drop the index created by the previous example first; PUT on an existing index fails
DELETE users
PUT users
{
    "mappings" : {
      "properties" : {
        "firstName" : {
          "type" : "text"
        },
        "lastName" : {
          "type" : "text"
        },
        "mobile" : {
          "type" : "keyword",
          "null_value": "NULL"
        }
      }
    }
}

copy_to

copy_to将字段的数值拷贝到目标字段
copy_to的目标字段不出现在_source中

# Drop the index created by the previous example first; PUT on an existing index fails
DELETE users
PUT users
{
  "mappings": {
    "properties": {
      "firstName":{
        "type": "text",
        "copy_to": "fullName"
      },
      "lastName":{
        "type": "text",
        "copy_to": "fullName"
      }
    }
  }
}
PUT users/_doc/1
{
  "firstName":"Ruan",
  "lastName": "Yiming"
}

GET users/_search?q=fullName:(Ruan Yiming)

POST users/_search
{
  "query": {
    "match": {
       "fullName":{
        "query": "Ruan Yiming",
        "operator": "and"
      }
    }
  }
}

数组类型

Elasticsearch中不提供专门的数组类型。但是任何字段，都可以包含多个相同类型的数值。

PUT users/_doc/1
{
  "name":"onebird",
  "interests":"reading"
}

PUT users/_doc/1
{
  "name":"twobirds",
  "interests":["reading","music"]
}

多字段类型

多字段特性

精确匹配：默认给每个text字段添加keyword字段
使用不同的analyzer
- 不同语言
- pinyin字段搜索
- 支持为搜索和索引指定不同的analyzer

Exact Values vs. Full Text

Exact Value:包括数字/日期/具体一个字符串（例如“Apple store”）
- Elasticsearch中的keyword
全文本：非结构化的文本数据
- Elasticsearch中的text

full textand exact value.png

Exact Value在索引时不需要被分词

Elasticsearch Mapping设置相关

作者: herefree
时间: 2019-11-27
分类: 大数据组件,Elasticsearch
评论

一、什么是Mapping

Mapping类似数据库中的schema的定义，作用如下

定义索引中的字段的名称
定义字段的数据类型，例如字符串，数字，布尔....
定义字段的倒排索引相关配置（Analyzed or Not Analyzed，Analyzer）

Mapping会把JSON文档映射成Lucene所需要的扁平模式

二、Mapping的数据类型

1.简单类型

Text/Keyword
Date
Integer/Floating
Boolean
IPv4&IPv6

2.复杂类型-对象和嵌套对象

对象类型/嵌套类型

3.特殊类型

geo_point&geo_shape/percolator

三、Dynamic Mapping

1.写入文档时候，如果索引不存在，会自动创建索引

2.Dynamic Mapping的机制，使得我们无需手动定义Mappings。Elasticsearch会自动根据文档信息推算出字段的类型

3.有时候会推算的不对，例如地理位置信息

4.如果类型设置不对，会导致一些功能无法正常运行，例如Range查询。

四、修改Mapping的字段类型

1.新增加字段

Dynamic设为true时，一旦有新增字段的文档写入，Mapping也会同时被更新
Dynamic设为false，Mapping不会被更新，新增字段的数据无法被索引，但是信息会出现在_source中
Dynamic设置成strict，文档写入失败

2.对已有字段，一旦已经有数据写入，就不再支持修改字段定义

Lucene实现的倒排索引，一旦生成后，就不允许修改

3.如果希望修改字段类型，必须使用Reindex API重建索引

因为如果修改了字段的数据类型，会导致已被索引的数据无法被搜索
但是如果是新增加的字段，就不会有这样的影响

五、自定义Mapping的方法

1.参考API手册，纯手写

2.也可以按照以下步骤

创建一个临时的index，写入一些样本数据
通过访问Mapping API获得该临时索引的动态Mapping定义
修改后使用该配置创建自己的索引
删除临时索引