Python 调用 elasticsearch 的 bulk 接口批量插入数据出现内存泄露，导致 OOM

数据导入脚本如下

import time
import sys
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

reload(sys)
sys.setdefaultencoding('utf-8')

def set_mapping(es, index_name = "content_engine", doc_type_name = "en"):
    my_mapping = {
            "en": {
                "properties": {
                    "a": {
                        "type": "string"
                    },
                    "b": {
                        "type": "string"
                    }
                }
            }
    }
    create_index = es.indices.create(index = index_name,body = my_mapping)
    mapping_index = es.indices.put_mapping(index = index_name, doc_type = doc_type_name, body = my_mapping)
    if create_index["acknowledged"] != True or mapping_index["acknowledged"] != True:
        print "Index creation failed..."

def set_data(es, input_file, index_name = "content_engine", doc_type_name="en"):
    i = 0
    count = 0
    ACTIONS = []
    for line in open(input_file):
        fields = line.replace("\r\n", "").replace("\n", "").split("----")
        if len(fields) == 2:
            a, b = fields
        else:
            continue
        action = {
            "_index": index_name,
            "_type": doc_type_name,
            "_source": {
                  "a": a,
                  "b": b, 
            }
        }
        i += 1
        ACTIONS.append(action)
        if (i == 500000):
            success, _ = bulk(es, ACTIONS, index = index_name, raise_on_error = True)
            count += success
            i = 0
            ACTIONS = []

    success, _ = bulk(es, ACTIONS, index = index_name, raise_on_error=True)
    count += success
    print("insert %s lines" % count)


if __name__ == '__main__':
    es = Elasticsearch(hosts=["127.0.0.1:9200"], timeout=5000)
    set_mapping(es)
    set_data(es,sys.argv[1])

数据大概 5 个 G 吧，机器配置虚拟机 24G 内存，刚开始无内存泄露现象，这个 Python 脚本的进程内存一直保持 1G 左右的占用，当插入 1600 ｗ，内存开始持续飙升，最后达到 22G ，导致触发 OOM 机制， Python 进程被内核 kill ，差点怀疑人生。。大家在遇到 Python 内存泄露都是怎么定位的？

firebroo

2016-12-22 23:06:00 +08:00

@miraclinger 嗯，我看了你的链接，官方的意思是推荐从一次导入 1000-5000 条开始测试直到找到最佳 performance 吧, 可能我的不是最佳，但是和这个应该没有关系,分割为小文件我导入我想过（现在我朋友推荐我使用 Java 的 API 用 9300 端口走 TCP 导入)，但是我其实想找到内存泄露的原因呢。
@WKPlus 试过了，依然 oom ，我还试过 del 之后用 gc 库显示回收 gc ，也是炸裂。