select * into #temp_main from test_rpa_raw..[dc_llq_gg_cwfx_bxyzxzb] where modtime >= ${modtime} create table #temp_name( rec_id bigint, msg varchar(1024), flag varchar(2), highlighted varchar(1024) ) insert into #temp_name select rec_id,'code/name/bgq/bz不得为空' as msg, '1' as flag,'code,name,bgq,bz' as highlighted from #temp_main where isnull(code,'')='' or isnull(name,'')='' or isnull(bgq,'')='' or isnull(bz,'')='' insert into #temp_name select rec_id,'下级不为空上级不得为空' as msg, '1' as flag,'code,name,bgq,bz' as highlighted from #temp_main where ((isnull(t001,'')<>'' and isnull(t002,'')='') or (isnull(t003,'')<>'' and isnull(t004,'')='')) INSERT INTO #temp_name SELECT rec_id, '数据错位' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE (ISNULL(t001, '') <> '' AND ISNULL(t002, '') <> '' AND t001 < t003) OR (ISNULL(t004, '') <> '' AND ISNULL(t002, '') <> '' AND t002 < t004) insert into #temp_name select rec_id, '下级数据不得大于上级数据' as msg, '1' as flag, 'code,name,bgq,bz' as highlighted from #temp_main where (isnull(t001, 0) <> 0 and isnull(t002, 0) <> 0 and t001 > t002) or (isnull(t003, 0) <> 0 and isnull(t004, 0) <> 0 and t003 > t004) INSERT INTO #temp_name SELECT rec_id, '相邻两字段不得相同' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE (isnull(code, '') <> '' AND isnull(name, '') <> '' AND code <> name) AND (isnull(name, '') <> '' AND isnull(bgq, '') <> '' AND name <> bgq) AND (isnull(bgq, '') <> '' AND isnull(bz, '') <> '' AND bgq <> bz); insert into #temp_name select rec_id,'重复记录' as msg, '1' as flag,'code,name,bgq,bz' as highlighted from #temp_main where isnull(code,'')+isnull(name,'')+isnull(bgq,'')+isnull(bz,'')+isnull(t001,'')+isnull(t002,'')+isnull(t003,'')+isnull(CAST(t004 AS VARCHAR(10)),'') in (select isnull(code,'')+isnull(name,'')+isnull(bgq,'')+isnull(bz,'')+isnull(t001,'')+isnull(t002,'')+isnull(t003,'')+isnull(CAST(t004 AS VARCHAR(10)),'') from #temp_main group by code,name,bgq,bz,t001,t002,t003,t004 having count(1)>1) INSERT INTO #temp_name SELECT rec_id, 'code格式有误' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE NOT (ISNULL(code, '') LIKE '_____.HK') INSERT INTO #temp_name SELECT rec_id, 'name字段格式有误' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE name NOT LIKE '%[一-龥]%' INSERT INTO #temp_name SELECT rec_id, 'bgq字段格式有误' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE NOT (bgq LIKE '%年报%' OR bgq LIKE '%季报%' OR bgq LIKE '%中报%') INSERT INTO #temp_name SELECT rec_id, 'bgq字段长度不匹配' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE LEN(bgq) NOT IN (LEN('2023年年报'), LEN('2024年二季/中报')) INSERT INTO #temp_name SELECT rec_id, 'bz字段内容有误' AS msg, '1' AS flag, 'code,name,bgq,bz' AS highlighted FROM #temp_main WHERE ISNULL(bz, '') NOT IN ('原始币种') select * from #temp_name; drop table #temp_name; drop table #temp_main;有什么问题

1周前

全网

文档

学术

百科

知识库

来源0

同主题0

来源引用的0

引用来源的0

百度学术

信任

万里挑一

基于Spark的大数据清洗设计与研究 - 百度学术

打开网页

基于Spark的大数据清洗设计与研究. 大数据技术之中核心是数据分析,但在真正解决大数据问题之时,大部分工作基本都集中在数据清洗阶段,大数据清洗是大数据处理的基础.Spark作 …

Wiley Online Library

信任

SQL for Data Scientists | Wiley Online Books

打开网页

Jump-start your career as a data scientist—learn to develop datasets for exploration, analysis, and machine learning SQL for Data Scientists: A Beginner's Guide …SummaryIndexWindow Functions and SubqueriesAnswers to ExercisesAnalytical Dataset Development Examples

2021-09-07发表

science

信任

Editorial Retraction | Science Advances

打开网页

On 20 September 2017, Sci. Adv. published the Research Article “Structural analysis of BRCA1 reveals modification hotspot” by Liang et al. (1). After publication, …

2024-09-11发表

查看更多来源（5）

回答简洁模式

深入研究一些

代码中存在一些逻辑和语法错误。

问题概述

数据清洗逻辑：代码主要进行数据清洗，检查数据完整性和格式正确性。
错误和不足：
1. 重复记录检查：使用字符串拼接检查重复记录可能效率低下。
2. 字段格式检查：使用 LIKE 语句进行格式检查可能不够精确。
3. 字段长度检查：使用 LEN 函数检查长度可能不够灵活。
4. 代码冗余：多个插入语句重复相似逻辑，可以优化。

优化建议

使用临时表：将重复逻辑提取到临时表中，减少代码冗余。
使用正则表达式：使用正则表达式进行更精确的格式检查。
使用聚合函数：使用聚合函数检查重复记录，提高效率。

优化后的代码

-- 创建临时表存储清洗后的数据
create table #temp_cleaned_data(
    rec_id bigint, 
    code varchar(1024),
    name varchar(1024),
    bgq varchar(1024),
    bz varchar(1024),
    t001 varchar(1024),
    t002 varchar(1024),
    t003 varchar(1024),
    t004 varchar(1024)
);

-- 插入数据并进行格式和完整性检查
insert into #temp_cleaned_data
select rec_id, code, name, bgq, bz, t001, t002, t003, t004
from test_rpa_raw..[dc_llq_gg_cwfx_bxyzxzb]
where modtime >= ${modtime}
and code like '_____.HK'
and name like '%[一-龥]%'
and (bgq like '%年报%' or bgq like '%季报%' or bgq like '%中报%')
and len(bgq) in (len('2023年年报'), len('2024年二季/中报'))
and bz in ('原始币种');

-- 检查重复记录
insert into #temp_name
select rec_id, '重复记录' as msg, '1' as flag, 'code,name,bgq,bz' as highlighted
from #temp_cleaned_data
group by code, name, bgq, bz, t001, t002, t003, t004
having count(1) > 1;

-- 检查数据错位
insert into #temp_name
select rec_id, '数据错位' as msg, '1' as flag, 'code,name,bgq,bz' as highlighted
from #temp_cleaned_data
where (isnull(t001, '') <> '' and isnull(t002, '') <> '' and t001 < t003)
   or (isnull(t004, '') <> '' and isnull(t002, '') <> '' and t002 < t004);

-- 检查下级数据不得大于上级数据
insert into #temp_name
select rec_id, '下级数据不得大于上级数据' as msg, '1' as flag, 'code,name,bgq,bz' as highlighted
from #temp_cleaned_data
where (isnull(t001, 0) <> 0 and isnull(t002, 0) <> 0 and t001 > t002)
   or (isnull(t003, 0) <> 0 and isnull(t004, 0) <> 0 and t003 > t004);

-- 检查相邻两字段不得相同
insert into #temp_name
select rec_id, '相邻两字段不得相同' as msg, '1' as flag, 'code,name,bgq,bz' as highlighted
from #temp_cleaned_data
where (isnull(code, '') <> '' and isnull(name, '') <> '' and code = name)
   or (isnull(name, '') <> '' and isnull(bgq, '') <> '' and name = bgq)
   or (isnull(bgq, '') <> '' and isnull(bz, '') <> '' and bgq = bz);

-- 选择结果
select * from #temp_name;

-- 清理临时表
drop table #temp_cleaned_data;
drop table #temp_name;

总结

优化点：通过创建临时表和使用正则表达式，提高了代码的可读性和执行效率。
注意事项：确保所有字段名和逻辑与实际数据表一致，避免硬编码。

你觉得结果怎么样？

数据清洗的常用方法有哪些？

如何提高数据清洗的效率？

数据清洗中常见的错误类型有哪些？

如何优化重复记录的检查逻辑？

使用正则表达式检查格式有哪些优势？

数据错位检查逻辑是否适用于所有场景？

简洁

正在语音输入中...

以上内容由AI搜集生成，仅供参考

在线客服