```scala
import com.bean.Yyds1
import org.apache.spark.sql.SparkSession

object TestReadCSV {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CSV Reader")
      .master("local")
      .getOrCreate()

    /**
     * Options can be passed as strings or as their specific types (e.g. Boolean):
     * delimiter                 field separator, comma by default
     * quote                     quote character, double quote by default
     * header                    treat the first row as column names rather than data
     * inferSchema               automatically infer each field's type
     * ignoreLeadingWhiteSpace   trim leading whitespace
     * ignoreTrailingWhiteSpace  trim trailing whitespace
     * nullValue                 string that represents null; pass null if no symbol should be treated as a null value
     * multiLine                 parse records that span multiple lines
     * encoding                  input encoding, e.g. GBK / UTF-8 / GB2312
     */
    import spark.implicits._

    val result = spark.read.format("csv")
      .option("delimiter", "\\t")
      .option("encoding", "GB2312")
      .option("enforceSchema", false)
      .option("header", "true")
      // .option("header", false)
      .option("quote", "'")
      .option("nullValue", "\\N")
      .option("ignoreLeadingWhiteSpace", false)
      .option("ignoreTrailingWhiteSpace", false)
      .option("nullValue", null) // later call with the same key overrides the "\N" setting above
      .option("multiline", "true")
      .load("G:\\python\\yyds\\yyds_1120_tab.csv")
      .as[Yyds1]
    // other sample files: aa1.csv, yyds_20211120, yyds_1120_tab2_utf-8

    result.map(row => row.ji_check_cnt.toInt)
      .foreachPartition(a => a.foreach(println _))
  }
}
```
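The code above deserializes each row into `com.bean.Yyds1` via `.as[Yyds1]`, but that class is not shown here. A minimal sketch of what it could look like, assuming the CSV carries a `ji_check_cnt` column read as a string (the exact field list is an assumption, not the original bean):

```scala
package com.bean

// Hypothetical sketch of the case class backing .as[Yyds1]; the real bean's fields
// are not shown in this post. Field names must match the CSV header (or the supplied
// schema), and ji_check_cnt is kept as String because the driver code calls .toInt on it.
case class Yyds1(
  ji_check_cnt: String
  // ... other columns from yyds_1120_tab.csv would be declared here
)
```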
pom.xml dependencies:
```xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>TmLimitPredict</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <log4j.version>1.2.17</log4j.version>
        <slf4j.version>1.7.22</slf4j.version>
        <!--0.8.2-beta 0.8.2.0 0.8.2.1 0.8.2.2 0.9.0.1 0.10.0.0 0.10.1.0 0.10.0.1 0.10.2.0 1.0.0 2.8.0-->
        <kafka.version>2.8.0</kafka.version>
        <spark.version>2.2.0</spark.version>
        <scala.version>2.11.8</scala.version>
        <jblas.version>1.2.1</jblas.version>
        <hadoop.version>2.7.3</hadoop.version>
    </properties>

    <dependencies>
        <!-- Common logging dependencies (currently commented out) -->
        <!--
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>
        -->

        <!-- Spark and Hadoop dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.google.guava</groupId>
                    <artifactId>guava</artifactId>
                </exclusion>
            </exclusions>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>15.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- Scala -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- MLlib -->
        <dependency>
            <groupId>org.scalanlp</groupId>
            <artifactId>jblas</artifactId>
            <version>${jblas.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- Kafka -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>com.sf.kafka</groupId>
            <artifactId>sf-kafka-api-core</artifactId>
            <version>2.4.1</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- Lombok: generates getters and setters -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.18</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <!-- <sourceDirectory>src/main/scala</sourceDirectory> -->
        <sourceDirectory>src/main/java</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>

        <resources>
            <resource>
                <directory>src/main/resources</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <!-- Optionally exclude configuration files: keep this commented while running in
                     the IDE so it can read the configuration; uncomment when packaging so the
                     configuration stays external and easy to modify. This can also be handled by
                     the maven-jar-plugin configuration below. -->
                <!--
                <excludes>
                    <exclude>config.properties</exclude>
                </excludes>
                -->
            </resource>
            <!-- Externalized configuration resources (placed in a conf directory that is also
                 added to the classpath; see the manifest configuration below) -->
            <!--
            <resource>
                <directory>src/main/resources</directory>
                <includes>
                    <include>config.properties</include>
                </includes>
                <targetPath>${project.build.directory}/conf</targetPath>
            </resource>
            -->
        </resources>

        <plugins>
            <!-- Scala compiler plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <!--
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                -->
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- Java compiler plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- ③ Build a zip package: when publishing, copy the zip to the server and unzip it.
                 It contains the application jar, the dependent lib directory and the config
                 files, so the service can be started directly. -->
            <plugin>
                <artifactId>maven-dependency-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>process-sources</phase>
                        <goals>
                            <goal>copy-dependencies</goal>
                        </goals>
                        <configuration>
                            <excludeScope>provided</excludeScope>
                            <outputDirectory>${project.build.directory}/lib</outputDirectory>
                        </configuration>
                    </execution>
                </executions>
            </plugin>

            <!-- maven-jar-plugin configuration -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <!-- Do not package resource files (keep configuration files separate from the jar) -->
                    <excludes>
                        <!-- <exclude>*.properties</exclude> -->
                        <!-- <exclude>*.xml</exclude> -->
                        <exclude>*.txt</exclude>
                    </excludes>
                    <!-- Archiver configuration -->
                    <archive>
                        <!-- Do not include pom.xml and pom.properties in the generated jar -->
                        <addMavenDescriptor>false</addMavenDescriptor>
                        <!-- Manifest configuration -->
                        <manifest>
                            <!-- Whether third-party jars are added to the manifest classpath -->
                            <!-- <addClasspath>true</addClasspath> -->
                            <addClasspath>false</addClasspath>
                            <!-- Classpath prefix in the generated manifest; third-party jars go
                                 into the lib directory, so the prefix is lib/ -->
                            <classpathPrefix>lib/</classpathPrefix>
                            <!-- Application main class -->
                            <!-- <mainClass>com.sf.tmlimit.TmLimitPredStream</mainClass> -->
                            <mainClass>ConnectKafkaTest</mainClass>
                        </manifest>
                        <!-- Add key-value pairs to the manifest to extend the classpath; the conf
                             directory can also be put on the classpath here -->
                        <manifestEntries>
                            <!-- <Class-Path>conf/</Class-Path> -->
                            <Class-Path>lib/</Class-Path>
                        </manifestEntries>
                    </archive>
                    <!-- Filter out files that should not be included in the jar -->
                    <!--
                    <excludes>
                        <exclude>${project.basedir}/xml/*</exclude>
                    </excludes>
                    -->
                </configuration>
            </plugin>

            <!-- maven-assembly-plugin configuration -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <!-- Assembly descriptor file -->
                    <descriptors>
                        <descriptor>src/main/assembly/assembly.xml</descriptor>
                    </descriptors>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
```
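The maven-assembly-plugin above points to `src/main/assembly/assembly.xml`, which is not shown in this post. A rough sketch of such a descriptor, assuming the goal described in the comments (a zip containing the application jar, the copied `lib/` dependencies and external config files); the id, formats and paths here are assumptions, not the project's actual file:

```xml
<!-- Hypothetical src/main/assembly/assembly.xml; adjust the id and paths to the real project -->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2
                              http://maven.apache.org/xsd/assembly-1.1.2.xsd">
    <id>dist</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <!-- the application jar built by maven-jar-plugin -->
        <fileSet>
            <directory>${project.build.directory}</directory>
            <outputDirectory>/</outputDirectory>
            <includes>
                <include>*.jar</include>
            </includes>
        </fileSet>
        <!-- third-party jars copied by maven-dependency-plugin -->
        <fileSet>
            <directory>${project.build.directory}/lib</directory>
            <outputDirectory>lib</outputDirectory>
        </fileSet>
        <!-- external configuration files -->
        <fileSet>
            <directory>src/main/resources</directory>
            <outputDirectory>conf</outputDirectory>
            <includes>
                <include>*.properties</include>
            </includes>
        </fileSet>
    </fileSets>
</assembly>
```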
| parameter | explanation |
| --- | --- |
| sep | Default is `,`: a single character used as the separator between fields and values. |
| encoding | Default is UTF-8: decodes the CSV files by the given encoding type. |
| quote | Default is `"`: a single character used for escaping quoted values where the separator can be part of the value. To turn off quoting, set an empty string rather than null. |
| escape | Default is `\`: a single character used for escaping quotes inside an already quoted value. |
| charToEscapeQuoteEscaping | Default is escape or `\0`: a single character used for escaping the escape character of the quote. The default is the escape character when the escape and quote characters are different, `\0` otherwise. |
| comment | Disabled by default: a single character used to skip lines beginning with that character. |
| header | Default is false: when true, the first line is used as column names instead of data. |
| enforceSchema | Default is true: if true, the specified or inferred schema is forcibly applied to the data source files and headers in the CSV files are ignored. If false and header is true, the schema is validated against all headers in the CSV files. Field names in the schema and column names in the CSV header are matched by position, taking spark.sql.caseSensitive into account. Although the default is true, it is recommended to disable enforceSchema to avoid incorrect results. |
| inferSchema | Default is false: infers the input schema automatically from the data. Requires one extra pass over the data. |
| samplingRatio | Default is 1.0: the fraction of rows used for schema inference. |
| ignoreLeadingWhiteSpace | Default is false: whether leading whitespace in values being read should be trimmed. |
| ignoreTrailingWhiteSpace | Default is false: whether trailing whitespace in values being read should be trimmed. |
| nullValue | Default is the empty string: the string representation of a null value. Since 2.0.1 this applies to all supported types, including the string type. |
| emptyValue | Default is the empty string: the string representation of an empty value. |
| nanValue | Default is NaN: the string representation of a non-number value. |
| positiveInf | Default is Inf: the string representation of positive infinity. |
| negativeInf | Default is -Inf: the string representation of negative infinity. |
| dateFormat | Default is yyyy-MM-dd: the string that indicates a date format. Custom date formats follow java.text.SimpleDateFormat. Applies to date types. |
| timestampFormat | Default is yyyy-MM-dd'T'HH:mm:ss.SSSXXX: the string that indicates a timestamp format. Custom formats follow java.text.SimpleDateFormat. Applies to timestamp types. |
| maxColumns | Default is 20480: a hard limit on how many columns a record can have. |
| maxCharsPerColumn | Default is -1: the maximum number of characters allowed for any single value being read; -1 means unlimited length. |
| mode | Default is PERMISSIVE: how corrupt records are handled during parsing. It supports the case-insensitive modes listed in the three rows below (see also the example after this table). Note that Spark parses only the columns required under column pruning, so which records count as corrupt can differ depending on the set of required fields; this behavior can be controlled with spark.sql.csv.parser.columnPruning.enabled (enabled by default). |
| mode = PERMISSIVE | When a corrupted record is encountered, puts the malformed string into the field configured by columnNameOfCorruptRecord and sets the other fields to null. To keep corrupt records, the user can add a string-type field named by columnNameOfCorruptRecord to the user-defined schema. |
| mode = DROPMALFORMED | Ignores the whole corrupted record. |
| mode = FAILFAST | Throws an exception when a corrupted record is encountered. |
| columnNameOfCorruptRecord | Default is the value of spark.sql.columnNameOfCorruptRecord: renames the field holding the malformed string created by PERMISSIVE mode; setting it here overrides spark.sql.columnNameOfCorruptRecord. |
| multiLine | Default is false: parse one record, which may span multiple lines. |
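As a small illustration of how the mode and columnNameOfCorruptRecord options from the table interact, the sketch below reads a hypothetical people.csv with a user-defined schema that keeps corrupt rows in a `_corrupt_record` column; the file name, schema and column names are assumptions for the example only:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object CsvModeExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CSV mode example")
      .master("local")
      .getOrCreate()

    // Hypothetical schema: the extra string column named by columnNameOfCorruptRecord
    // is where PERMISSIVE mode stores rows it could not parse.
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true),
      StructField("_corrupt_record", StringType, nullable = true)
    ))

    val df = spark.read
      .schema(schema)
      .option("header", "true")
      .option("mode", "PERMISSIVE")                           // keep bad rows instead of failing
      .option("columnNameOfCorruptRecord", "_corrupt_record") // where the malformed text goes
      .option("nullValue", "\\N")                             // treat \N as null, as in the table above
      .csv("data/people.csv")                                 // assumed sample path

    // Rows whose age could not be parsed keep the raw line in _corrupt_record.
    val badRows = df.filter(df("_corrupt_record").isNotNull)
    badRows.show(false)

    spark.stop()
  }
}
```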