At work, the ID numbers stored in a Hive table needed strict validation. Because the last character is a check digit, a plain regular expression cannot validate it, so the check is implemented as a UDF.
I have only implemented the functionality and have not optimized it in depth. Comments and suggestions on how to optimize it are welcome; I would be grateful.
The source code is as follows.
import org.apache.hadoop.hive.ql.exec.UDF; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashSet; import java.util.regex.Pattern; /** * Check ID number * Strong verification rules for ID card format: * 1,The structural citizenship number of a number is a feature combination code, which consists of a 17-digit digital ontology code and a check code. The order from left to right is: six digit address code, eight digit date of birth code, three digit sequence code and one digit check code. * 2,Address code (the first six digits) denotes the administrative division code of the county (city, flag, district) where the permanent residence is located, which is implemented according to GB/T 2260. Specific reference: http://www.mca.gov.cn/article/sj/xzqh/2019/ * 3,The date of birth code (7th to 14th digits) denotes the year, month and day of birth of the coding object, which is implemented according to GB/T7408. There is no separator between the year, month and day codes. * 4,Sequence codes (fifteenth to seventeenth digits) denote the sequence numbers of people born in the same year, month and day within the area marked by the same address code. The odd numbers of sequence codes are assigned to men and even numbers to women. * 5,Check code (eighteenth digit) *(1)The weighted summation formulas S = Sum (Ai * Wi), I = 0,..., 16 for 17-digit digital ontology codes. First, the weighted summation of the first 17 digits is performed. * Ai:Represents the number value Wi of the ID card number at position i: Represents the weighting factor Wi at position i: 7 9 10 5 8 4 2 1 6 3 7 9 10 8 4 *(2)Computation Y = mod(S, 11) (3) gets the corresponding check code Y: 0 1 2 3 4 5 6 7 8 9 10: 10 X 9 8 7 6 5 4 2 by module. 
*/ public class CheckIDNumber extends UDF{ //Digital regularization private static final String regexNum = "^[0-9]*$"; //Rules for leap year birthdays private static final String regexBirthdayInLeapYear = "^((19[0-9]{2})|(2[0-9]{3}))((01|03|05|07|08|10|12)(0[1-9]|[1-2][0-9]|3[0-1])|(04|06|09|11)(0[1-9]|[1-2][0-9]|30)|02(0[1-9]|[1-2][0-9]))$"; //Ordinary Birthday Rules private static final String regexBirthdayInCommonYear = "^((19[0-9]{2})|(2[0-9]{3}))((01|03|05|07|08|10|12)(0[1-9]|[1-2][0-9]|3[0-1])|(04|06|09|11)(0[1-9]|[1-2][0-9]|30)|02(0[1-9]|1[0-9]|2[0-8]))$"; // Hong Kong Geographical Code Value private static final int HONGKONG_AREACODE = 810000; // Taiwan Territorial Code Value private static final int TAIWAN_AREACODE = 710000; // Macau Regional Code Value private static final int MACAO_AREACODE = 820000; // Coded values for other provinces private static final HashSet<Integer> provincesCode = new HashSet<Integer>(32); static { /* The area code initialized here is only the province code. Because after checking the complete area codes in GB/T 2260-1999, many of them do not correspond to the area codes of ID numbers. So there is no comparison of complete area codes. 
*/ provincesCode.add(11); provincesCode.add(12); provincesCode.add(13); provincesCode.add(14); provincesCode.add(15); provincesCode.add(21); provincesCode.add(22); provincesCode.add(23); provincesCode.add(31); provincesCode.add(32); provincesCode.add(33); provincesCode.add(34); provincesCode.add(35); provincesCode.add(36); provincesCode.add(37); provincesCode.add(41); provincesCode.add(42); provincesCode.add(43); provincesCode.add(44); provincesCode.add(45); provincesCode.add(46); provincesCode.add(50); provincesCode.add(51); provincesCode.add(52); provincesCode.add(53); provincesCode.add(54); provincesCode.add(61); provincesCode.add(62); provincesCode.add(63); provincesCode.add(64); provincesCode.add(65); } public static boolean evaluate(String idNumber) { // Remove spaces and change letters to uppercase idNumber = idNumber.trim().toUpperCase(); // Identity Card Regular Check if (!checkIdNumberRegex(idNumber)) { return false; } // ID card area code check if (!checkIdNumberArea(idNumber.substring(0, 6))) { return false; } // Converting 15 ID Cards to 18 idNumber = convertFifteenToEighteen(idNumber); // Date of Birth Inspection of Identity Card if (!checkBirthday(idNumber.substring(6, 14))) { return false; } // Check of ID Card Check Code if (!checkIdNumberVerifyCode(idNumber)) { return false; } return true; } /** * Identity Card Regular Check */ private static boolean checkIdNumberRegex(String idNumber) { return Pattern.matches("^([0-9]{17}[0-9Xx])|([0-9]{15})$", idNumber); } /** * Converting 15 ID Cards to 18 */ private static String convertFifteenToEighteen(String idNumber) { if (15 != idNumber.length()) { return idNumber; } // Add 19 before the year and data checkpoint idNumber = idNumber.substring(0, 6) + "19" + idNumber.substring(6, 15); idNumber = idNumber + getVerifyCode(idNumber); return idNumber; } /** * ID card area code check * Note: This compares only the provincial codes, not the city and county codes, because many of the area codes in GB/T 2260-1999 
do not correspond to the area codes of identity card numbers, so there is no comparison of the complete area codes. */ private static boolean checkIdNumberArea(String idNumberArea) { // Comparing codes of Hong Kong, Macao and Taiwan int areaCode = Integer.parseInt(idNumberArea); if (areaCode == HONGKONG_AREACODE || areaCode == MACAO_AREACODE || areaCode == TAIWAN_AREACODE) { return true; } // Compare the codes of other provinces Integer provincesCodeTemp = Integer.parseInt(idNumberArea.substring(0,2)); return provincesCode.contains(provincesCodeTemp); } /** * What is the date of birth of ID card? */ private static boolean checkBirthday(String idNumberBirthdayStr) { Integer year = null; try { year = Integer.valueOf(idNumberBirthdayStr.substring(0, 4)); } catch (Exception e) { } if (null == year) { return false; } boolean birthdayMatch; // Check whether the date of birth is a standard date format? if (isLeapYear(year)) { // Leap year birthdayMatch = Pattern.matches(regexBirthdayInLeapYear, idNumberBirthdayStr); } else { birthdayMatch = Pattern.matches(regexBirthdayInCommonYear, idNumberBirthdayStr); } if(birthdayMatch){ // Birth date can only be less than or equal to the current system date SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMdd"); Date birthday; try { birthday = formatter.parse(idNumberBirthdayStr); } catch (ParseException e) { e.printStackTrace(); return false; } int number = birthday.compareTo(new Date()); return number == -1 || number ==0; } return birthdayMatch; } /** * Judging whether it is a leap year */ private static boolean isLeapYear(int year) { return (year % 400 == 0) || (year % 100 != 0 && year % 4 == 0); } /** * Calculate ID card check code according to the first 17 bits of ID card */ private static String getVerifyCode(String idNumber) { if (!Pattern.matches(regexNum, idNumber.substring(0, 17))) { return null; } String[] ValCodeArr = { "1", "0", "X", "9", "8", "7", "6", "5", "4", "3", "2" }; String[] Wi = { "7", "9", "10", "5", 
"8", "4", "2", "1", "6", "3", "7", "9", "10", "5", "8", "4", "2" }; int sum = 0; for (int i = 0; i < 17; i++) { sum = sum + Integer.parseInt(String.valueOf(idNumber.charAt(i))) * Integer.parseInt(Wi[i]); } return ValCodeArr[sum % 11]; } /** * Check of ID Card Check Code */ private static boolean checkIdNumberVerifyCode(String idNumber) { return getVerifyCode(idNumber).equalsIgnoreCase(idNumber.substring(17)); } }
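After packaging, upload the JAR to HDFS and register the function. (The class name below assumes the class is declared in package com.business.bi.udf; the listing above omits the package statement.)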
create function hive_udf.check_idnumber as 'com.business.bi.udf.CheckIDNumber' USING JAR 'hdfs://hadoop1/user/hive/warehouse/hive_udf.db/MyHiveUDF.jar';
Reference resources: https://blog.csdn.net/zhengyong15984285623/article/details/51784731
Some modifications have been made:
1. The code is packaged as a Hive UDF.
2. The area-code check now validates only the province prefix instead of the complete area code.
3. The birth-date check is stricter: in addition to being a valid calendar date, the date must not be later than the current date.