1. df.na.fill({'field name 1': 'default', 'field name 2': 'default'}) replaces null values in the named fields with the given defaults.
2. df.dropDuplicates() de-duplicates on the given field names; called with no arguments it de-duplicates on all fields.
3. df.subtract(df1) returns the rows that appear in the current DataFrame but not in df1; the result is de-duplicated.
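A minimal sketch of items 1-3, assuming a local SparkSession; the DataFrames and column names here are made up for illustration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("df_basics").getOrCreate()

df = spark.createDataFrame(
    [('make', 'cq'), ('make', None), ('uzi', 'sh'), ('uzi', 'sh')],
    ['name', 'city'])
df1 = spark.createDataFrame([('uzi', 'sh')], ['name', 'city'])

df.na.fill({'city': 'default'}).show()   # null city becomes 'default'
df.dropDuplicates(['name']).show()       # keep one row per name; no arguments means all fields
df.subtract(df1).show()                  # rows in df that do not appear in df1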
4. print time.localtime(timestamp) prints a time struct in the following format:
time.struct_time(tm_year=2018, tm_mon=10, tm_mday=9, tm_hour=16, tm_min=52, tm_sec=10, tm_wday=1, tm_yday=282, tm_isdst=0)
5. print time.strptime('2018-12-14 15:45:12', '%Y-%m-%d %H:%M:%S') prints a time struct in the following format:
time.struct_time(tm_year=2018, tm_mon=12, tm_mday=14, tm_hour=15, tm_min=45, tm_sec=12, tm_wday=4, tm_yday=348, tm_isdst=-1)
6. Formatting a timestamp as a time string
time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))
# Formats as 2016-03-20 11:45:39
print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# Formats as Sat Mar 28 22:24:24 2016
print time.strftime("%a %b %d %H:%M:%S %Y", time.localtime())
7. Converting a time string to a timestamp
print time.mktime(time.strptime('2018-12-14 15:45:12', '%Y-%m-%d %H:%M:%S'))
8. Getting the date a given number of days before or after a given day
def getPreDate(cp, days):
    # Convert cp to a time, then shift it by the given number of days
    cp_from = (datetime.strptime(str(cp), '%Y%m%d%H') + timedelta(days)).strftime('%Y%m%d%H')
    return cp_from
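A quick usage check of the function defined above, with hypothetical values, assuming the datetime imports it relies on:

from datetime import datetime, timedelta

print(getPreDate('2018101016', -1))   # 2018100916, one day earlier
print(getPreDate('2018101016', 3))    # 2018101316, three days later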
9. Usage of os.path.join(path_prefix, 'parquet', 'path')
print os.path.join('/user/bbd/dev/xxxx/xxxx/', 'parquet/', 'ccccc.txt')
# /user/bbd/dev/xxxx/xxxx/parquet/ccccc.txt
10. Calculating the difference between two dates in days
def get_history_date():
    start_date = datetime(2018, 10, 10)
    end_date = datetime(2018, 10, 31)
    print (end_date - start_date).days   # difference in days
    print end_date - start_date          # difference as a timedelta
    for i in range(0, (end_date - start_date).days):
        cp_from = (start_date + timedelta(days=i)).strftime('%Y%m%d00')
        cp_end = (end_date + timedelta(days=i + 1)).strftime('%Y%m%d00')
        print str(cp_from) + '\t' + str(cp_end)

# Output:
# 21
# 21 days, 0:00:00
# 2018101000    2018110100
# 2018101100    2018110200
# 2018101200    2018110300
# ...
11. not negates a value: an empty string, None, the number 0, and an empty object are all falsy, so applying not to any of them returns True.
def not_ceshi():
    a = ''
    b = 'as'
    c = None
    d = 0
    e = -1
    if not a:
        print 'a is falsy'
    if b:
        print 'b is truthy'
    if not c:
        print 'c is falsy'
    if not d:
        print 'd is 0'
    if e:
        print 'e is -1'

# Result:
# a is falsy
# b is truthy
# c is falsy
# d is 0
# e is -1
12. DataFrame drop can be combined with a window: partition and sort first, then drop. Here num is the row_number() field produced by sorting within each window partition.
df.where("num = 1").drop("num")  # keep the rows where num is 1, then remove the num column
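A sketch of the same pattern with the DataFrame API rather than SQL, assuming a DataFrame df with name, age and hight columns like the one built in item 17:

from pyspark.sql import Window
from pyspark.sql import functions as F

# Number the rows inside each (name, age) partition, keep the first row, drop the helper column
w = Window.partitionBy('name', 'age').orderBy('hight')
df_dedup = (df.withColumn('num', F.row_number().over(w))
              .where('num = 1')
              .drop('num'))
df_dedup.show()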
13. Using the higher-order function sorted
def cmp_ignore_case(s1, s2):
    if s1.lower()[:1] > s2.lower()[:1]:
        return 1
    elif s1.lower()[:1] < s2.lower()[:1]:
        return -1
    else:
        return 0

# sorted can redefine the ordering; key= takes a one-argument function used for comparison
cmp_ignore_case2 = lambda s: s[:1].lower()
print sorted(['bob', 'about', 'Zoo', 'Credit'], key=cmp_ignore_case2)
print sorted(['bob', 'about', 'Zoo', 'Credit'], cmp_ignore_case)

# Both return the same result:
# ['about', 'bob', 'Credit', 'Zoo']
# ['about', 'bob', 'Credit', 'Zoo']
14. When we need to unionAll two DataFrames and the number of fields differs, we can pad the missing fields with blank columns so the schemas match (see the sketch below).
sql = "select <same field name>, '' <same field name>, ... "  -- selecting an empty-string literal under the missing field's name is equivalent to padding out the fields
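A sketch of the padding idea, assuming df_a has columns (name, age) while df_b only has name; the empty-string literal is aliased to the missing column so both sides line up:

# Pad df_b with an empty age column so the schemas match, then union
df_b_padded = df_b.selectExpr("name", "'' as age")
df_all = df_a.select("name", "age").unionAll(df_b_padded)
df_all.show()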
15. Verifying whether a string contains a personal email address.
email_regx = '([a-zA-Z0-9]{1}[a-zA-Z0-9\.\+\-.]{0,63}@{1}[a-zA-Z0-9\-_]+(\.[a-zA-Z0-9\-_]+){1,4})'

def udf_format_email_string(data):
    try:
        data = data.strip()
        email = re.findall(email_regx, data)
        print email
        if email:
            res = email[0][0]
            except_email = ['np-reply@', 'noreply@', 'service@', 'support@', 'info@', 'Postmaster@',
                            'custmer_service@', 'admin@', 'VOICE=', 'notification@', '10086@',
                            '10000sx@', '10086cq@']
            for item in except_email:
                if res.startswith(item):
                    return ''
            return res
    except:
        pass
    return ''

# If a personal address is found, return it; otherwise return an empty string.
A point worth knowing here: what re.findall(regx, data) returns depends on whether the pattern regx contains groups. There are three cases: nested groups, a single group, and no groups.
import re

string = "abcdefg acbdgef abcdgfe cadbgfe"

# Nested groups
regex = re.compile("((\w+)\s+\w+)")
print(regex.findall(string))
# Output: [('abcdefg acbdgef', 'abcdefg'), ('abcdgfe cadbgfe', 'abcdgfe')]

# A single group
regex1 = re.compile("(\w+)\s+\w+")
print(regex1.findall(string))
# Output: ['abcdefg', 'abcdgfe']

# No groups
regex2 = re.compile("\w+\s+\w+")
print(regex2.findall(string))
# Output: ['abcdefg acbdgef', 'abcdgfe cadbgfe']

# Summary:
# regex:  returns tuples of (outer group match, inner group match)
# regex1: returns only what the group matched
# regex2: returns the full matches
16. In pyspark, when operating on a DataFrame, applying max(col) to a Chinese name field after grouping keeps a single value per group. This de-duplicates the names, so each email address corresponds to exactly one name.
select email, format_data(max(name)) from email_detal group by email  -- groups by email and keeps one name per email, although the kept name may differ from some of the original names
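The same idea with the DataFrame API, assuming a DataFrame df_email with email and name columns (the format_data UDF from the SQL above is omitted here):

from pyspark.sql import functions as F

# One row per email; max() picks a single name (the lexicographically largest) when several exist
df_one_name = df_email.groupBy('email').agg(F.max('name').alias('name'))
df_one_name.show()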
17. Some DataFrame API examples
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType

def get_df():
    conf = SparkConf()
    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("get_df") \
        .config(conf=conf) \
        .enableHiveSupport() \
        .getOrCreate()
    rdd1 = spark.sparkContext.parallelize([
        ('make', 24, 198),
        ('make', 23, 198),
        ('tubu', 24, 198),
        ('tubu', 23, 198),
        ('mark', 24, 198),
        ('mark', 23, 198),
        ('uzi', 24, 198),
        ('uzi', 23, 197)
    ])
    rdd2 = spark.sparkContext.parallelize([
        ('make', 24, 198),
        ('tubu', 24, 198)
    ])
    schema = StructType([
        StructField('name', StringType(), True),
        StructField('age', LongType(), True),
        StructField('hight', LongType(), True)
    ])
    df1 = spark.createDataFrame(rdd1, schema)
    df2 = spark.createDataFrame(rdd2, schema)
    df1.createTempView('tmp')
    sql = 'select *, row_number() over (partition by name,age order by hight) as num from tmp'
    sql2 = 'select `name`, age, case when age=24 then age else age+2 end re_age, hight, "table_name" tablename from tmp'
    spark.sql(sql).where('num=1').drop("num").show()    # drop a column field
    df1.subtract(df2).show()                            # remove from df1 the rows that also appear in df2
    df1.dropDuplicates(['name', 'age']).show()          # de-duplicate on the two given columns; with no arguments, all columns
    df1.drop_duplicates(['name', 'age']).show()         # same as above
    df1.join(df2, df1.name == df2.name, 'left').show()  # the various joins
    spark.sql(sql2).drop('age').show()                  # derive one field's value from other fields
18. Setting up logging in a py file
import logging, os

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

Used to check whether the relevant information gets logged.
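A couple of hypothetical calls showing how the logger configured above would then be used in the ETL code of item 19:

logger.info('start cleaning the school tables')
logger.warning('table %s returned no rows', 'tb_gaw_jw_xxjzygxx')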
19. Cleaning the fields needed from multiple tables into a parquet file. We have to decide for ourselves which dimensions to read and how to filter them: first understand the underlying base tables, then clean the corresponding fields according to their meaning. For example, to write the data of several source tables into one summary table, we can do the following.
from collections import OrderedDict

def creat_dataframe(source):
    '''Gather all the required fields from one source table for the source summary table'''
    df = spark.sql("select * from {tablename}".format(tablename=source["table_name"]))
    if source.get("before_where"):
        df = df.where(source["before_where"])
    column_source = OrderedDict(zip(source["column"], source["source"]))
    columns = ["%s as %s" % (table_file, table_name) for table_name, table_file in column_source.items()]
    df = df.selectExpr(*columns)
    if source.get("after_where"):
        df = df.where(source["after_where"])
    return df

def etl_school():
    columns = ["NAME", "SSQX", "XYMC", "TABLENAME", "table_order"]
    where = "verify_school(NAME) = 1"
    data_source = [
        {
            "table_name": "tb_gaw_dzxxyryxsjfmxx",
            "column": columns,
            "source": ["DWMC", "SSQX", "''", "'tb_gaw_dzxxyryxsjfmxx'", 5],
            "after_where": where
        },
        {
            "table_name": "tb_gaw_jw_xxjzygxx",
            "column": columns,
            "source": ["DWMC", "SSQX", "''", "'tb_gaw_jw_xxjzygxx'", 1],
            "after_where": where
        },
        {
            "table_name": "tb_gaw_jw_zxyjs_new",
            "column": columns,
            "source": ["XXMC", "''", "FYMC", "'tb_gaw_jw_zxyjs_new'", 2],
            "after_where": where
        },
        {
            "table_name": "tb_gaw_jw_zxdxs_new",
            "column": columns,
            "source": ["XXMC", "''", "FYMC", "'tb_gaw_jw_zxdxs_new'", 3],
            "after_where": where
        }
    ]
    dfs = map(creat_dataframe, data_source)
    df_union = reduce(lambda x, y: x.union(y), dfs)
    df = df_union.selectExpr("*", "row_number() over (partition by NAME,XYMC order by table_order asc) as num")
    df_tmp = df.where("num=1").drop("num")
    df_tmp = df_tmp.where("NAME != XYMC")
    write_parquet(df_tmp, "tmp/etl_school")
    logger.info("save to /tmp/etl_school success!!")