Implementation of a Sentence2Vec Neural Network Based on Quick-Thought Vectors

Keywords: Python, GitHub, neural network

I. Preface

1. Skip-Thought Vectors paper and code: https://github.com/ryankiros/skip-thoughts

2. This article assumes that the reader is already familiar with Skip-Gram word vectors and RNN fundamentals.

3. Quick-Thought paper: Lajanugen Logeswaran, Honglak Lee. An Efficient Framework for Learning Sentence Representations. In ICLR, 2018.

II. Practice

1. Split the data into sentences (dropping short ones), remove sentences that appear too frequently, and tokenize.

import os
import re
from collections import Counter

import jieba


def fenju(data):
    # Split each raw text into sentences on ". " and keep only sentences
    # longer than 10 characters.
    sentence = []
    for i in range(len(data)):
        try:
            # Match the literal ". " separator (the dot must be escaped in the regex).
            m = re.findall(r'\. ', data[i][0])
            if data[i][1] is not None and len(m) > 0:
                if len(m) > 1:
                    content = data[i][0].split('. ')
                    for c in range(len(content)):
                        if len(content[c]) > 10:
                            sentence.append(content[c] + '. ')
                elif len(data[i][0]) > 10:
                    sentence.append(data[i][0])
            else:
                continue
        except Exception:
            continue
    return sentence

def _process_sentence_list(sentence_list, threshold=0.01):
    sentence_count = Counter(sentence_list)
    total_count = len(sentence_list)
    # Calculate sentence frequency
    sentence_freqs = {w: c / total_count for w, c in sentence_count.items()}
    # Remove sentences that appear too frequently
    sentence=[]
    for w in range(len(sentence_list)):
        if sentence_freqs[sentence_list[w]] < threshold:
            sentence.append(sentence_list[w])
        else:
            continue
    return sentence

def fenci(alltext, writefile, filename):
    # Tokenize each sentence with jieba and write one sentence per line.
    if not os.path.exists(writefile):
        os.makedirs(writefile)
    # Strip existing whitespace, segment with jieba, then re-join the tokens with spaces.
    sentence = [' '.join(jieba.lcut(''.join(text.split()))) for text in alltext]
    print(sentence)
    with open(os.path.join(writefile, filename), "w") as fw:
        fw.write("\n".join(sentence))
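A minimal usage sketch tying the three preprocessing functions together; the (text, label) data layout, the corpus/ directory and the file name sentences.txt are illustrative assumptions, not taken from the original repository:

# Hypothetical input: a list of (text, label) pairs, matching the
# data[i][0] / data[i][1] indexing used in fenju().
raw_data = [
    ("The first sentence is long enough. The second sentence also qualifies. ", 1),
    ("A short one. Another sufficiently long sentence for this demo. ", 0),
]

sentences = fenju(raw_data)
# On a tiny toy corpus every sentence looks "frequent", so the threshold is
# raised here; on a real corpus the default of 0.01 is the intended setting.
sentences = _process_sentence_list(sentences, threshold=0.9)
# Segment with jieba and write one sentence per line to corpus/sentences.txt.
fenci(sentences, "corpus", "sentences.txt")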

2. Build the vocabulary and TFRecord files (see the GitHub code for details; a rough sketch of the serialization idea follows).
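Only the idea is sketched here; the repository's data preparation script handles the real details. Assuming the vocabulary maps every token to an integer id and the feature key is called "ids" (both of these are assumptions for illustration), each tokenized sentence can be serialized roughly like this:

import tensorflow as tf

def sentence_to_example(token_ids):
    # Wrap a list of integer word ids into a tf.train.Example.
    feature = {"ids": tf.train.Feature(int64_list=tf.train.Int64List(value=token_ids))}
    return tf.train.Example(features=tf.train.Features(feature=feature))

# Write each sentence of the tokenized corpus into one TFRecord shard.
with tf.python_io.TFRecordWriter("train-00000-of-00001.tfrecord") as writer:
    for token_ids in [[4, 27, 9, 2], [13, 5, 88, 3, 2]]:   # toy id sequences
        writer.write(sentence_to_example(token_ids).SerializeToString())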

3. Model input definition (three modes: train / eval / encode)

  def build_inputs(self):

    if self.mode == "encode":
      encode_ids = tf.placeholder(tf.int64, (None, None), name="encode_ids")
      encode_mask = tf.placeholder(tf.int8, (None, None), name="encode_mask")
    else:
      # Prefetch serialized tf.Example protos.
      input_queue = input_ops.prefetch_input_data(
          self.reader,
          FLAGS.input_file_pattern,
          shuffle=FLAGS.shuffle_input_data,
          capacity=FLAGS.input_queue_capacity,
          num_reader_threads=FLAGS.num_input_reader_threads)
      print("input_queue",input_queue)
      # Deserialize a batch.
      serialized = input_queue.dequeue_many(FLAGS.batch_size)
      encode = input_ops.parse_example_batch(serialized)
      encode_ids = encode.ids
      encode_mask = encode.mask
    self.encode_ids = encode_ids
    self.encode_mask = encode_mask

Since every sentence in a batch is padded to the same length, we need to pass a mask to the RNN to prevent the padding from affecting training: encode_mask marks which positions are real tokens and therefore encodes the original length of each sentence.
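As a small illustration with made-up values: the mask marks real tokens with 1 and padding with 0, so summing it along the time axis recovers each sentence's true length, which is exactly what the rnn() method below does with tf.reduce_sum:

import numpy as np

# Two sentences padded to length 5: the first has 3 real tokens, the second 5.
encode_ids = np.array([[12, 7, 9, 0, 0],
                       [4, 25, 3, 8, 2]])
encode_mask = np.array([[1, 1, 1, 0, 0],
                        [1, 1, 1, 1, 1]], dtype=np.int8)

lengths = encode_mask.sum(axis=1)   # -> [3, 5], the original sentence lengths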

4. Embedding the input sentences

  def build_word_embeddings(self):
    rand_init = self.uniform_initializer
    self.word_embeddings = []
    self.encode_emb = []
    self.init = None
    for v in self.config.vocab_configs:
      if v.mode == 'fixed':
        if self.mode == "train":
          word_emb = tf.get_variable(
              name=v.name,
              shape=[v.size, v.dim],
              trainable=False)
          embedding_placeholder = tf.placeholder(
              tf.float32, [v.size, v.dim])
          embedding_init = word_emb.assign(embedding_placeholder)

          rand = np.random.rand(1, v.dim)
          word_vecs = np.load(v.embs_file)
          load_vocab_size = word_vecs.shape[0]
          assert(load_vocab_size == v.size - 1)
          word_init = np.concatenate((rand, word_vecs), axis=0)
          self.init = (embedding_init, embedding_placeholder, word_init)
        
        else:
          word_emb = tf.get_variable(
              name=v.name,
              shape=[v.size, v.dim])

        encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids)
        self.word_emb = word_emb
        self.encode_emb.extend([encode_emb, encode_emb])  # the same embedding feeds both the input and the output encoder

      if v.mode == 'trained':
        for inout in ["", "_out"]:
          word_emb = tf.get_variable(
              name=v.name + inout,
              shape=[v.size, v.dim],
              initializer=rand_init)
          if self.mode == 'train':
            self.word_embeddings.append(word_emb)

          encode_emb = tf.nn.embedding_lookup(word_emb, self.encode_ids)
          self.encode_emb.append(encode_emb)

      if v.mode == 'expand':
        for inout in ["", "_out"]:
          encode_emb = tf.placeholder(tf.float32, (
              None, None, v.dim), v.name + inout)
          self.encode_emb.append(encode_emb)
          word_emb_dict = read_vocab_embs(v.vocab_file + inout + ".txt",
              v.embs_file + inout + ".npy")
          self.word_embeddings.append(word_emb_dict)

      if v.mode != 'expand' and self.mode == 'encode':
        word_emb_dict = read_vocab(v.vocab_file)
        self.word_embeddings.extend([word_emb_dict, word_emb_dict])

An embedding lookup converts every word in a sentence into a dense vector of dimension v.dim. The three values of v.mode are fixed (use a pre-trained embedding), trained (learn the embedding during training), and expand (expand the vocabulary with an external embedding). The final output has the form [encode_emb, encode_emb], which is later used to model the relation between a sentence and its neighbouring (previous and next) sentences.
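A toy example of the shapes involved (the sizes are made up): with v.size = 5 and v.dim = 3, the lookup turns a [batch_size, seq_len] matrix of word ids into a [batch_size, seq_len, dim] tensor:

import numpy as np
import tensorflow as tf

word_emb = tf.constant(np.random.rand(5, 3), dtype=tf.float32)    # [vocab_size, dim]
encode_ids = tf.constant([[1, 4, 0], [2, 2, 3]])                  # [batch_size, seq_len]
encode_emb = tf.nn.embedding_lookup(word_emb, encode_ids)         # [batch_size, seq_len, dim]

with tf.Session() as sess:
    print(sess.run(tf.shape(encode_emb)))   # -> [2 3 3]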

5. Building the encoder

The encoder encodes each sentence and uses the final hidden state as its representation; a single-layer GRU or LSTM, or a bidirectional GRU/LSTM, can be used.

  def _initialize_cell(self, num_units, cell_type="GRU"):
    if cell_type == "GRU":
      return tf.contrib.rnn.GRUCell(num_units=num_units)
    elif cell_type == "LSTM":
      return tf.contrib.rnn.LSTMCell(num_units=num_units)
    else:
      raise ValueError("Invalid cell type")


  def rnn(self, word_embs, mask, scope, encoder_dim, cell_type="GRU"):
    length = tf.to_int32(tf.reduce_sum(mask, 1), name="length")
    if self.config.bidir:
      if encoder_dim % 2:
        raise ValueError(
            "encoder_dim must be even when using a bidirectional encoder.")
      num_units = encoder_dim // 2
      cell_fw = self._initialize_cell(num_units, cell_type=cell_type)
      cell_bw = self._initialize_cell(num_units, cell_type=cell_type)
      outputs, states = tf.nn.bidirectional_dynamic_rnn(
          cell_fw=cell_fw,
          cell_bw=cell_bw,
          inputs=word_embs,
          sequence_length=length,
          dtype=tf.float32,
          scope=scope)
      if cell_type == "LSTM":
        states = [states[0][1], states[1][1]]
      state = tf.concat(states, 1)
    else:
      cell = self._initialize_cell(encoder_dim, cell_type=cell_type)
      outputs, state = tf.nn.dynamic_rnn(
          cell=cell,
          inputs=word_embs,
          sequence_length=length,
          dtype=tf.float32,
          scope=scope)
      if cell_type == "LSTM":
        state = state[1]
    return state

  def build_encoder(self):
    """Builds the sentence encoder.

    Inputs:
      self.encode_emb
      self.encode_mask

    Outputs:
      self.thought_vectors

    Raises:
      ValueError: if config.bidirectional_encoder is True and config.encoder_dim
        is odd.
    """
    names = ["", "_out"]
    self.thought_vectors = []
    for i in range(2):
      with tf.variable_scope("encoder" + names[i]) as scope:
        if self.config.encoder == "gru":
          sent_rep = self.rnn(self.encode_emb[i], self.encode_mask, scope, self.config.encoder_dim, cell_type="GRU")
        elif self.config.encoder == "lstm":
          sent_rep = self.rnn(self.encode_emb[i], self.encode_mask, scope, self.config.encoder_dim, cell_type="LSTM")
        elif self.config.encoder == 'bow':
          sent_rep = self.bow(self.encode_emb[i], self.encode_mask)
        else:
          raise ValueError("Invalid encoder")

        thought_vectors = tf.identity(sent_rep, name="thought_vectors")
        self.thought_vectors.append(thought_vectors)

As can be seen, [encode_emb, encode_emb] is encoded into [thought_vectors, thought_vectors]: the two copies go through two separate encoders (variable scopes "encoder" and "encoder_out"), one producing the representation of the input sentence and the other the representation used for its context sentences.
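At encode time the model exposes encode_ids, encode_mask and thought_vectors, so extracting sentence vectors from a trained checkpoint looks roughly like the sketch below. The model object s2v_model (already built in "encode" mode) and the checkpoint path "model.ckpt" are placeholders; the repository ships its own encoder wrapper that handles this properly.

import tensorflow as tf

with tf.Session() as sess:
    tf.train.Saver().restore(sess, "model.ckpt")
    feed = {
        s2v_model.encode_ids: [[12, 7, 9, 0, 0]],    # padded word ids
        s2v_model.encode_mask: [[1, 1, 1, 0, 0]],    # 1 = real token, 0 = padding
    }
    # thought_vectors[0] comes from the "encoder" scope, [1] from "encoder_out".
    vectors = sess.run(s2v_model.thought_vectors[0], feed_dict=feed)
    print(vectors.shape)                              # (1, encoder_dim)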

6. Constructing the loss function

  def build_loss(self):
    """Builds the loss Tensor.

    Outputs:
      self.total_loss
    """
    all_sen_embs = self.thought_vectors
  
    if FLAGS.dropout:
      mask_shp = [1, self.config.encoder_dim]
      bin_mask = tf.random_uniform(mask_shp) > FLAGS.dropout_rate
      bin_mask = tf.where(bin_mask, tf.ones(mask_shp), tf.zeros(mask_shp))
      src = all_sen_embs[0] * bin_mask
      dst = all_sen_embs[1] * bin_mask
      scores = tf.matmul(src, dst, transpose_b=True)
    else:
      # score[i][j]: similarity between sentence i (input encoder) and sentence j
      # (output encoder), used to learn which sentences are the previous/next ones.
      scores = tf.matmul(all_sen_embs[0], all_sen_embs[1], transpose_b=True)

    # Ignore source sentence
    scores = tf.matrix_set_diag(scores, np.zeros(FLAGS.batch_size))
    # Targets
    targets_np = np.zeros((FLAGS.batch_size, FLAGS.batch_size))
    ctxt_sent_pos = list(range(-FLAGS.context_size, FLAGS.context_size + 1))
    ctxt_sent_pos.remove(0)
    for ctxt_pos in ctxt_sent_pos:
      targets_np += np.eye(FLAGS.batch_size, k=ctxt_pos)
    targets_np_sum = np.sum(targets_np, axis=1, keepdims=True)
    targets_np = targets_np/targets_np_sum
    targets = tf.constant(targets_np, dtype=tf.float32)

    # Forward and backward scores    
    f_scores = scores[:-1]
    b_scores = scores[1:]

    losses = tf.nn.softmax_cross_entropy_with_logits(
        labels=targets, logits=scores)
  
    loss = tf.reduce_mean(losses)

    tf.summary.scalar("losses/ent_loss", loss)
    self.total_loss = loss

    if self.mode == "eval":
      f_max = tf.to_int64(tf.argmax(f_scores, axis=1))
      b_max = tf.to_int64(tf.argmax(b_scores, axis=1))

      targets = range(FLAGS.batch_size - 1)
      targets = tf.constant(list(targets), dtype=tf.int64)
      fwd_targets = targets + 1

      names_to_values, names_to_updates = tf.contrib.slim.metrics.aggregate_metric_map({
        "Acc/Fwd Acc": tf.contrib.slim.metrics.streaming_accuracy(f_max, fwd_targets), 
        "Acc/Bwd Acc": tf.contrib.slim.metrics.streaming_accuracy(b_max, targets)
      })

      for name, value in names_to_values.items():
        tf.summary.scalar(name, value)

      self.eval_op = names_to_updates.values()
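To make the targets in the code above concrete, here is what targets_np looks like for a toy batch_size of 4 and context_size of 1, computed exactly as in build_loss:

import numpy as np

batch_size, context_size = 4, 1
targets_np = np.zeros((batch_size, batch_size))
for k in (-context_size, context_size):          # context positions around the diagonal
    targets_np += np.eye(batch_size, k=k)
targets_np /= targets_np.sum(axis=1, keepdims=True)
print(targets_np)
# [[0.  1.  0.  0. ]
#  [0.5 0.  0.5 0. ]
#  [0.  0.5 0.  0.5]
#  [0.  0.  1.  0. ]]
# Each row says: a sentence's "correct answers" are its neighbouring sentences in the batch.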

The idea behind the loss function: each row of scores measures the similarity between one sentence (from the input encoder) and every other sentence in the batch (from the output encoder); the diagonal is zeroed out so a sentence never predicts itself, and the targets spread equal probability over the sentences inside the context window, so the model learns to pick out a sentence's neighbours from all candidates in the batch.

The cross-entropy is computed with tf.nn.softmax_cross_entropy_with_logits(labels=targets, logits=scores). From the targets we can see that the Quick-Thought idea is to infer which sentences in the batch are a given sentence's context, i.e. to learn sentence similarity from neighbouring sentences. Personally, I do not think this fully captures the characteristics of the target sentence itself: when I used sentence vectors trained with Quick-Thought for downstream classification, such as movie sentiment classification (the standard Quick-Thought evaluation), the results were not very good.
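For reference, the downstream evaluation mentioned above (e.g. movie sentiment classification) is usually done by freezing the sentence vectors and training a simple classifier on top. A minimal sketch with scikit-learn; the random sentence_vectors and labels stand in for the real encoder outputs and sentiment labels:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Placeholders: in practice these are the thought vectors produced by the
# encoder above and the matching sentiment labels.
sentence_vectors = np.random.rand(200, 128)      # [num_sentences, encoder_dim]
labels = np.random.randint(0, 2, size=200)       # 0 = negative, 1 = positive

X_train, X_test, y_train, y_test = train_test_split(
    sentence_vectors, labels, test_size=0.2, random_state=0)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))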

Paper reproduction code: https://github.com/lajanugen/S2V

Modified version (Chinese): https://github.com/jinjia/Quick_Thought
