go 协程比 Python 多进程快好多!

2023-12-13 15:04:09 +08:00
 777777

需求:有 2 个 list(alist、blist),alist 中的每个值都要与 blist 中的每个值做字符串相似度计算,两个 list 的数量级均为 20 万。下面是 Python 和 Go 的代码片段。Python:

# Compute the similarity of two strings.
def similar(a, b):
    """Return ``fuzz.token_set_ratio`` of the two strings divided by 100.

    Returns 0 when either argument is None. Only ``b`` is lower-cased before
    the comparison; ``a`` is passed through unchanged. Every comparison is
    also printed to stdout.
    """
    for value in (a, b):
        if value is None:
            return 0

    raw_score = fuzz.token_set_ratio(a, b.lower())
    similarity = raw_score / 100
    print("similar a:", a, ",", "b:", b, ", similarity:", similarity)
    return similarity


def compute_similarity(args):
    """Score one ``(record_name, name)`` pair.

    Takes a single two-item argument so it can be used with ``map``-style
    APIs, and returns ``(similarity, name)`` so the caller can tell which
    candidate name the score belongs to.
    """
    record_name, name = args
    score = similar(record_name, name)
    return score, name


# Update database records.
def update_database(cursor, name_mapping, csv_data):
    """Set ``factory`` on fingerprint rows whose name matches a CSV row.

    A CSV row matches a UUID when its ``name`` is exactly one of the names
    mapped to that UUID or, failing any exact match, when
    ``compute_similarity`` scores the pair above 0.98.

    Args:
        cursor: Database cursor; only ``executemany`` is called on it.
        name_mapping: Mapping of UUID -> iterable of names. None/empty names
            are ignored.
        csv_data: Iterable of rows read through ``row.get("vendor")`` and
            ``row.get("name")``. Rows with a missing vendor or name, or whose
            vendor is "未知" or "None", are skipped.

    Returns:
        The number of ``(vendor, uuid)`` pairs passed to ``executemany``
        (0 when nothing matched, in which case the cursor is not touched).
    """
    import os  # local import: the file's import block is not visible here

    update_sql = "UPDATE tweb_fingerprint_test SET factory = %s WHERE uuid = %s"

    # Reverse mapping so the UUIDs for a name can be looked up directly.
    name_to_uuid = defaultdict(list)
    for uuid, names in name_mapping.items():
        for name in names:
            if name:  # skip None / empty names
                name_to_uuid[name].append(uuid)

    # The candidate names are the same for every row, so build the list once.
    known_names = list(name_to_uuid)
    # Send tasks to the worker processes in batches: with the default
    # chunksize of 1 every single comparison is pickled and shipped on its own.
    chunksize = max(1, len(known_names) // (4 * (os.cpu_count() or 1)))

    updates = []
    with ProcessPoolExecutor() as executor:
        for row in csv_data:
            vendor_name = row.get("vendor")
            record_name = row.get("name")
            print("record_name:", record_name)
            if (
                vendor_name is None
                or record_name is None
                or vendor_name in ["未知", "None"]
            ):
                continue  # skip this row

            # Exact lookup first. Copy the list so the fuzzy path below can
            # never mutate the lists stored in name_to_uuid.
            uuids_to_update = list(name_to_uuid.get(record_name, []))

            # No exact match: fall back to names scoring above 98% similarity.
            if not uuids_to_update:
                tasks = [(record_name, name) for name in known_names]
                results = executor.map(
                    compute_similarity, tasks, chunksize=chunksize
                )
                for similarity, name in results:
                    if similarity > 0.98:
                        # name_to_uuid[name] is a list of UUIDs; extend with
                        # its items so each UUID becomes its own update.
                        uuids_to_update.extend(name_to_uuid[name])

            updates.extend((vendor_name, uuid) for uuid in uuids_to_update)

    # Batch update.
    if updates:
        cursor.executemany(update_sql, updates)

    return len(updates)

go:

// Similar reports how alike a and b are, as scored by smetrics.JaroWinkler
// with a boost threshold of 0.7 and a prefix size of 4.
func Similar(a, b string) float64 {
	const (
		boostThreshold = 0.7
		prefixSize     = 4
	)
	score := smetrics.JaroWinkler(a, b, boostThreshold, prefixSize)
	return score
}

// UpdateDatabase sets the factory column of tweb_fingerprint rows whose vendor
// name matches the name of a CSV record.
//
// Every record is compared against every vendor in its own goroutine. A vendor
// matches when its name equals the record name or when Similar scores the pair
// above SimilarityThreshold. Matches are sent over a channel to the calling
// goroutine, which executes one prepared UPDATE per match.
//
// It returns the number of UPDATE statements that executed successfully. On
// the first Exec error it stops and returns that error together with the count
// reached so far; matcher goroutines blocked on a send are then released
// instead of being leaked.
func UpdateDatabase(db *sql.DB, vendors map[string]Vendor, records []CSVRecord) (int, error) {
	fmt.Println("records", len(records))
	fmt.Println("vendors", len(vendors))
	stmt, err := db.Prepare("UPDATE tweb_fingerprint SET factory = ? WHERE uuid = ?")
	if err != nil {
		return 0, err
	}
	defer stmt.Close()

	// done is closed when this function returns. Without it, an early return
	// on an Exec error would leave senders blocked on updates forever once the
	// buffer fills, and the closer goroutine below would never finish.
	done := make(chan struct{})
	defer close(done)

	var wg sync.WaitGroup
	updates := make(chan Updatedata, len(records))

	// NOTE(review): this starts one goroutine per record with no upper bound;
	// for very large inputs consider a worker pool sized from
	// runtime.GOMAXPROCS(0).
	for _, record := range records {
		wg.Add(1)
		go func(record CSVRecord) {
			defer wg.Done()
			for _, vendor := range vendors {
				if record.Name == vendor.Name.String || Similar(record.Name, vendor.Name.String) > SimilarityThreshold {
					select {
					case updates <- Updatedata{
						UUID:    vendor.UUID,
						Factory: record.Vendor,
					}:
					case <-done:
						// The consumer has gone away; stop producing.
						return
					}
				}
			}
		}(record)
	}

	// Close updates once every matcher has finished so the range loop below
	// terminates.
	go func() {
		wg.Wait()
		close(updates)
	}()

	count := 0
	for update := range updates {
		fmt.Println("update:", update)
		if _, err := stmt.Exec(update.Factory, update.UUID); err != nil {
			return count, err
		}
		count++
	}

	return count, nil
}
924 次点击
所在节点    程序员
2 条回复
Baloneo
2023-12-13 16:27:22 +08:00
快多少?
777777
2023-12-13 17:23:30 +08:00
@Baloneo 至少 10 倍吧,python CPU 都打不满,没跑完我就重构成 go 了,go 十分钟就跑完了

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/1000036

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX