Preface

In Lucene, a Term is the atomic unit of indexing and search. In ES, both keyword and text fields are analyzed into Terms, which Lucene then indexes and persists to disk. The Term-related index files are the Term Dictionary (.tim) and the Term Index (.tip), plus the associated postings files (.doc, .pos, .pay). This post focuses on how the Term-related data structures already built in memory are written out to disk.

This post follows the code at Lucene tag releases/lucene/9.1.0, commit 5b522487ba8e0f1002b50a136817ca037aec9686, and only covers writing the docIDs of the posting list; freq, position and payload are ignored.
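To see these files appear on disk, here is a minimal indexing sketch of my own (not from the post; it assumes Lucene 9.1.0 on the classpath and an arbitrary /tmp/term-files-demo directory, with the compound-file format disabled so the individual files stay visible):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class TermFilesDemo {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer())
        .setUseCompoundFile(false); // keep .tim/.tip/.doc as separate files instead of a .cfs
    try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/term-files-demo"));
        IndexWriter writer = new IndexWriter(dir, config)) {
      Document doc = new Document();
      doc.add(new TextField("body", "lucene stores terms in a block tree", Field.Store.NO));
      writer.addDocument(doc);
      // Flushing the segment runs the writer code analyzed below and produces files like
      // _0.tim / _0.tip (term dictionary + index) and _0.doc / _0.pos (postings).
      writer.commit();
    }
  }
}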

Call Chain

TermsWriter.write -> Lucene90PostingsWriter.writeTerm (startTerm / startDoc / finishDoc / finishTerm), then TermsWriter.pushTerm -> writeBlocks -> writeBlock -> Lucene90PostingsWriter.encodeTerm; finally TermsWriter.finish builds the root FST index and saves it to the .tip file.

Source Code Analysis

The focus is the write path of Lucene90BlockTreeTermsWriter; the core function is TermsWriter.write.

  1. Lucene90BlockTreeTermsWriter.write -> TermsWriter.write
// Lucene90BlockTreeTermsWriter.TermsWriter.write
// TermsEnum represents all terms associated with this field
public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException {

// First write the postings for this term. postingsWriter is an instance of Lucene90PostingsWriter; this covers the docIDs and the skip list data.
BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms);
if (state != null) {

assert state.docFreq != 0;
assert fieldInfo.getIndexOptions() == IndexOptions.DOCS
|| state.totalTermFreq >= state.docFreq
: "postingsWriter=" + postingsWriter;
pushTerm(text);

PendingTerm term = new PendingTerm(text, state);
pending.add(term);
// if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" +
// pending.size());

sumDocFreq += state.docFreq;
sumTotalTermFreq += state.totalTermFreq;
numTerms++;
if (firstPendingTerm == null) {
firstPendingTerm = term;
}
lastPendingTerm = term;
}
}
  2. PushPostingsWriterBase.writeTerm defines the overall writeTerm flow; Lucene90PostingsWriter extends PushPostingsWriterBase
 public final BlockTermState writeTerm(
BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen, NormsProducer norms)
throws IOException {
// norms are not covered in this post
NumericDocValues normValues;
if (fieldInfo.hasNorms() == false) {
normValues = null;
} else {
normValues = norms.getNorms(fieldInfo);
}
// Per-term initialization, including lastDocID and the skip list; the concrete method invoked is Lucene90PostingsWriter.startTerm
startTerm(normValues);
// Get the posting-list iterator for this term; this post only follows the docIDs in the postings
postingsEnum = termsEnum.postings(postingsEnum, enumFlags);
assert postingsEnum != null;

int docFreq = 0;
long totalTermFreq = 0;
while (true) {
int docID = postingsEnum.nextDoc();
if (docID == PostingsEnum.NO_MORE_DOCS) {
break;
}
docFreq++;
docsSeen.set(docID);
int freq;
if (writeFreqs) {
freq = postingsEnum.freq();
totalTermFreq += freq;
} else {
freq = -1;
}
// Core postings function: Lucene90PostingsWriter.startDoc
startDoc(docID, freq);

if (writePositions) {
for (int i = 0; i < freq; i++) {
int pos = postingsEnum.nextPosition();
BytesRef payload = writePayloads ? postingsEnum.getPayload() : null;
int startOffset;
int endOffset;
if (writeOffsets) {
startOffset = postingsEnum.startOffset();
endOffset = postingsEnum.endOffset();
} else {
startOffset = -1;
endOffset = -1;
}
addPosition(pos, payload, startOffset, endOffset);
}
}

finishDoc();
}

if (docFreq == 0) {
return null;
} else {
BlockTermState state = newTermState();
state.docFreq = docFreq;
state.totalTermFreq = writeFreqs ? totalTermFreq : -1;
finishTerm(state);
return state;
}
}
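The loop above is the write-time mirror of the normal read API. For reference, here is a small reader-side sketch of my own (not part of the writer code) that walks one term's postings with the same nextDoc()/freq() pattern, assuming an already-open LeafReader:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

static void dumpPostings(LeafReader reader, String field, String term) throws IOException {
  Terms terms = reader.terms(field);
  if (terms == null) {
    return; // field is not indexed in this segment
  }
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(new BytesRef(term)) == false) {
    return; // term does not exist
  }
  PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
  for (int doc = postings.nextDoc();
      doc != DocIdSetIterator.NO_MORE_DOCS;
      doc = postings.nextDoc()) {
    System.out.println("doc=" + doc + " freq=" + postings.freq());
  }
}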
  3. Lucene90PostingsWriter.startDoc
 public void startDoc(int docID, int termDocFreq) throws IOException {
// A full posting block has just been completed: buffer its skip data before starting the new block
if (lastBlockDocID != -1 && docBufferUpto == 0) {
skipWriter.bufferSkip(
lastBlockDocID,
competitiveFreqNormAccumulator,
docCount,
lastBlockPosFP,
lastBlockPayFP,
lastBlockPosBufferUpto,
lastBlockPayloadByteUpto);
competitiveFreqNormAccumulator.clear();
}

// Postings store docID deltas, and docIDs must be strictly increasing, which implies no duplicates (a doc appears at most once in a term's postings)
final int docDelta = docID - lastDocID;

if (docID < 0 || (docCount > 0 && docDelta <= 0)) {
throw new CorruptIndexException(
"docs out of order (" + docID + " <= " + lastDocID + " )", docOut);
}

docDeltaBuffer[docBufferUpto] = docDelta;
if (writeFreqs) {
freqBuffer[docBufferUpto] = termDocFreq;
}
// per-block doc count
docBufferUpto++;
// overall doc count for this term
docCount++;

// Once BLOCK_SIZE (128) docs are buffered, encode and write them to docOut (the .doc file)
if (docBufferUpto == BLOCK_SIZE) {
pforUtil.encode(docDeltaBuffer, docOut);
if (writeFreqs) {
pforUtil.encode(freqBuffer, docOut);
}
// NOTE: don't set docBufferUpto back to 0 here;
// finishDoc will do so (because it needs to see that
// the block was filled so it can save skip data)
}

// update lastDocID
lastDocID = docID;
lastPosition = 0;
lastStartOffset = 0;

long norm;
if (fieldHasNorms) {
boolean found = norms.advanceExact(docID);
if (found == false) {
// This can happen if indexing hits a problem after adding a doc to the
// postings but before buffering the norm. Such documents are written
// deleted and will go away on the first merge.
norm = 1L;
} else {
norm = norms.longValue();
assert norm != 0 : docID;
}
} else {
norm = 1L;
}

competitiveFreqNormAccumulator.add(writeFreqs ? termDocFreq : 1, norm);
}
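A standalone sketch (my own simplification, not Lucene code) of the buffering done in startDoc: docIDs are turned into deltas and flushed in fixed blocks of 128, while the tail stays buffered for finishTerm. Here encodeBlock stands in for pforUtil.encode and docOut is omitted:

static final int BLOCK_SIZE = 128; // same block size Lucene90PostingsWriter uses

static void bufferDocIDs(int[] sortedDocIDs) {
  long[] docDeltaBuffer = new long[BLOCK_SIZE];
  int docBufferUpto = 0;
  int lastDocID = 0;
  for (int docID : sortedDocIDs) {
    docDeltaBuffer[docBufferUpto++] = docID - lastDocID; // postings store deltas, not absolute IDs
    lastDocID = docID;
    if (docBufferUpto == BLOCK_SIZE) {
      encodeBlock(docDeltaBuffer); // in Lucene: pforUtil.encode(docDeltaBuffer, docOut)
      docBufferUpto = 0;           // in Lucene this reset happens in finishDoc, not here
    }
  }
  // whatever is left (fewer than BLOCK_SIZE deltas) stays buffered; finishTerm writes it as vInts
}

static void encodeBlock(long[] deltas) {
  // placeholder for the PForUtil block encoding
}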
  4. Lucene90PostingsWriter.finishDoc
public void finishDoc() throws IOException {
// Since we don't know df for current term, we had to buffer
// those skip data for each block, and when a new doc comes,
// write them to skip file.
if (docBufferUpto == BLOCK_SIZE) {
// Skip data is recorded per block; the docID associated with a block is the largest docID in that block
lastBlockDocID = lastDocID;
if (posOut != null) {
if (payOut != null) {
lastBlockPayFP = payOut.getFilePointer();
}
lastBlockPosFP = posOut.getFilePointer();
lastBlockPosBufferUpto = posBufferUpto;
lastBlockPayloadByteUpto = payloadByteUpto;
}
docBufferUpto = 0;
}
}
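Conceptually, the skip data buffered here is one entry per finished 128-doc block, keyed by that block's largest docID, so that advance(target) can jump over whole blocks without decoding them. A minimal single-level sketch of my own (a simplification of the multi-level skip list Lucene actually writes):

record SkipEntry(int lastDocIDInBlock, long docFilePointer) {}

// Returns the last skip entry whose block lies entirely below target, i.e. the point
// from which sequential decoding has to resume; null means start from the beginning.
static SkipEntry findBlockToResumeAfter(java.util.List<SkipEntry> skips, int targetDocID) {
  SkipEntry candidate = null;
  for (SkipEntry entry : skips) {
    if (entry.lastDocIDInBlock() < targetDocID) {
      candidate = entry; // every doc in this block is below target, safe to skip past it
    } else {
      break; // skip entries are sorted by docID, no need to look further
    }
  }
  return candidate;
}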

  5. Lucene90PostingsWriter.finishTerm
 public void finishTerm(BlockTermState _state) throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
assert state.docFreq > 0;

// TODO: wasteful we are counting this (counting # docs
// for this term) in two places?
assert state.docFreq == docCount : state.docFreq + " vs " + docCount;

// docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to
// it.
final int singletonDocID;
if (state.docFreq == 1) {
// pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
singletonDocID = (int) docDeltaBuffer[0];
} else {
singletonDocID = -1;
// vInt encode the remaining doc deltas and freqs:
for (int i = 0; i < docBufferUpto; i++) {
final int docDelta = (int) docDeltaBuffer[i];
final int freq = (int) freqBuffer[i];
if (!writeFreqs) {
// Write the docDeltas of the final block (fewer than BLOCK_SIZE entries) to the .doc file
docOut.writeVInt(docDelta);
} else if (freq == 1) {
docOut.writeVInt((docDelta << 1) | 1);
} else {
docOut.writeVInt(docDelta << 1);
docOut.writeVInt(freq);
}
}
}

final long lastPosBlockOffset;

if (writePositions) {
// totalTermFreq is just total number of positions(or payloads, or offsets)
// associated with current term.
assert state.totalTermFreq != -1;
if (state.totalTermFreq > BLOCK_SIZE) {
// record file offset for last pos in last block
lastPosBlockOffset = posOut.getFilePointer() - posStartFP;
} else {
lastPosBlockOffset = -1;
}
if (posBufferUpto > 0) {
// TODO: should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
// majority)

// vInt encode the remaining positions/payloads/offsets:
int lastPayloadLength = -1; // force first payload length to be written
int lastOffsetLength = -1; // force first offset length to be written
int payloadBytesReadUpto = 0;
for (int i = 0; i < posBufferUpto; i++) {
final int posDelta = (int) posDeltaBuffer[i];
if (writePayloads) {
final int payloadLength = (int) payloadLengthBuffer[i];
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
posOut.writeVInt((posDelta << 1) | 1);
posOut.writeVInt(payloadLength);
} else {
posOut.writeVInt(posDelta << 1);
}

if (payloadLength != 0) {
posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength);
payloadBytesReadUpto += payloadLength;
}
} else {
posOut.writeVInt(posDelta);
}

if (writeOffsets) {
int delta = (int) offsetStartDeltaBuffer[i];
int length = (int) offsetLengthBuffer[i];
if (length == lastOffsetLength) {
posOut.writeVInt(delta << 1);
} else {
posOut.writeVInt(delta << 1 | 1);
posOut.writeVInt(length);
lastOffsetLength = length;
}
}
}

if (writePayloads) {
assert payloadBytesReadUpto == payloadByteUpto;
payloadByteUpto = 0;
}
}
} else {
lastPosBlockOffset = -1;
}

long skipOffset;
// Write the skip list data; note that it is written only after the postings of this term
if (docCount > BLOCK_SIZE) {
skipOffset = skipWriter.writeSkip(docOut) - docStartFP;
} else {
skipOffset = -1;
}

state.docStartFP = docStartFP;
state.posStartFP = posStartFP;
state.payStartFP = payStartFP;
state.singletonDocID = singletonDocID;
state.skipOffset = skipOffset;
state.lastPosBlockOffset = lastPosBlockOffset;
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
docCount = 0;
}
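The tail encoding at the top of finishTerm packs a flag into the low bit of the doc delta: a set bit means freq == 1 and no separate freq vInt follows. A matching read-side sketch of my own (not Lucene90PostingsReader):

import java.io.IOException;
import org.apache.lucene.store.DataInput;

// Returns {docDelta, freq} for one tail entry written by finishTerm.
static int[] readTailEntry(DataInput docIn, boolean fieldHasFreqs) throws IOException {
  if (fieldHasFreqs == false) {
    return new int[] {docIn.readVInt(), 1};
  }
  int code = docIn.readVInt();
  int docDelta = code >>> 1;                         // upper bits carry the delta
  int freq = (code & 1) != 0 ? 1 : docIn.readVInt(); // low bit set: freq is implicitly 1
  return new int[] {docDelta, freq};
}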
  6. Lucene90BlockTreeTermsWriter.TermsWriter.pushTerm
 private void pushTerm(BytesRef text) throws IOException {
// Find common prefix between last term and current term:
int prefixLength =
Arrays.mismatch(
lastTerm.bytes(),
0,
lastTerm.length(),
text.bytes,
text.offset,
text.offset + text.length);
if (prefixLength == -1) { // Only happens for the first term, if it is empty
assert lastTerm.length() == 0;
prefixLength = 0;
}

// if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length);

// Close the "abandoned" suffix now:
for (int i = lastTerm.length() - 1; i >= prefixLength; i--) {

// How many items on top of the stack share the current suffix
// we are closing:
int prefixTopSize = pending.size() - prefixStarts[i];
if (prefixTopSize >= minItemsInBlock) {
// if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + "
// minItemsInBlock=" + minItemsInBlock);
// Key step: build the FST-based index for the blocks under this prefix
writeBlocks(i + 1, prefixTopSize);
prefixStarts[i] -= prefixTopSize - 1;
}
}

if (prefixStarts.length < text.length) {
prefixStarts = ArrayUtil.grow(prefixStarts, text.length);
}

// Init new tail:
for (int i = prefixLength; i < text.length; i++) {
prefixStarts[i] = pending.size();
}

lastTerm.copyBytes(text);
}
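The prefix computation relies on Arrays.mismatch over the raw term bytes; because terms arrive in sorted order, the shared prefix with the previous term decides which pending suffixes can be closed. A small sketch of my own isolating just that computation:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

static int sharedPrefixLength(byte[] lastTerm, byte[] currentTerm) {
  int mismatch = Arrays.mismatch(lastTerm, 0, lastTerm.length, currentTerm, 0, currentTerm.length);
  // -1 means the two ranges are identical; otherwise mismatch is the shared prefix length
  return mismatch == -1 ? lastTerm.length : mismatch;
}

// sharedPrefixLength("foobar", "foofight") == 3, so pushTerm would consider closing the
// pending entries hanging below the prefixes "foobar", "fooba" and "foob".
static int demo() {
  return sharedPrefixLength(
      "foobar".getBytes(StandardCharsets.UTF_8),
      "foofight".getBytes(StandardCharsets.UTF_8));
}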
  7. Lucene90BlockTreeTermsWriter.TermsWriter.writeBlocks
void writeBlocks(int prefixLength, int count) throws IOException {

assert count > 0;

// if (DEBUG2) {
// BytesRef br = new BytesRef(lastTerm.bytes());
// br.length = prefixLength;
// System.out.println("writeBlocks: seg=" + segment + " prefix=" + brToString(br) + " count="
// + count);
// }

// Root block better write all remaining pending entries:
assert prefixLength > 0 || count == pending.size();

int lastSuffixLeadLabel = -1;

// True if we saw at least one term in this block (we record if a block
// only points to sub-blocks in the terms index so we can avoid seeking
// to it when we are looking for a term):
boolean hasTerms = false;
boolean hasSubBlocks = false;

int start = pending.size() - count;
int end = pending.size();
int nextBlockStart = start;
int nextFloorLeadLabel = -1;

for (int i = start; i < end; i++) {

PendingEntry ent = pending.get(i);

int suffixLeadLabel;

if (ent.isTerm) {
PendingTerm term = (PendingTerm) ent;
if (term.termBytes.length == prefixLength) {
// Suffix is 0, i.e. prefix 'foo' and term is
// 'foo' so the term has empty string suffix
// in this block
assert lastSuffixLeadLabel == -1
: "i=" + i + " lastSuffixLeadLabel=" + lastSuffixLeadLabel;
suffixLeadLabel = -1;
} else {
suffixLeadLabel = term.termBytes[prefixLength] & 0xff;
}
} else {
PendingBlock block = (PendingBlock) ent;
assert block.prefix.length > prefixLength;
suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
}
// if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" +
// suffixLeadLabel);

if (suffixLeadLabel != lastSuffixLeadLabel) {
int itemsInBlock = i - nextBlockStart;
if (itemsInBlock >= minItemsInBlock && end - nextBlockStart > maxItemsInBlock) {
// The count is too large for one block, so we must break it into "floor" blocks, where
// we record
// the leading label of the suffix of the first term in each floor block, so at search
// time we can
// jump to the right floor block. We just use a naive greedy segmenter here: make a new
// floor
// block as soon as we have at least minItemsInBlock. This is not always best: it often
// produces
// a too-small block as the final block:
boolean isFloor = itemsInBlock < count;
newBlocks.add(
writeBlock(
prefixLength,
isFloor,
nextFloorLeadLabel,
nextBlockStart,
i,
hasTerms,
hasSubBlocks));

hasTerms = false;
hasSubBlocks = false;
nextFloorLeadLabel = suffixLeadLabel;
nextBlockStart = i;
}

lastSuffixLeadLabel = suffixLeadLabel;
}

if (ent.isTerm) {
hasTerms = true;
} else {
hasSubBlocks = true;
}
}

// Write last block, if any:
if (nextBlockStart < end) {
int itemsInBlock = end - nextBlockStart;
boolean isFloor = itemsInBlock < count;
newBlocks.add(
writeBlock(
prefixLength,
isFloor,
nextFloorLeadLabel,
nextBlockStart,
end,
hasTerms,
hasSubBlocks));
}

assert newBlocks.isEmpty() == false;

PendingBlock firstBlock = newBlocks.get(0);

assert firstBlock.isFloor || newBlocks.size() == 1;

// NOTE: compile the FST index covering the blocks just written
firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef);

// Remove slice from the top of the pending stack, that we just wrote:
pending.subList(pending.size() - count, pending.size()).clear();

// Append new block
pending.add(firstBlock);

newBlocks.clear();
}
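The floor-block logic can be isolated as a greedy segmentation over the suffix lead bytes. A standalone sketch of my own (a simplification), using the default limits minItemsInBlock=25 and maxItemsInBlock=48: a new floor block is cut only on a lead-label boundary, once at least minItemsInBlock entries have accumulated and more than maxItemsInBlock entries remain from the current block start:

// Returns the start index of each floor block for one shared prefix, given the suffix
// lead label (first byte after the prefix, or -1 for the empty suffix) of every entry.
static java.util.List<Integer> floorBlockStarts(int[] suffixLeadLabels, int minItems, int maxItems) {
  java.util.List<Integer> starts = new java.util.ArrayList<>();
  starts.add(0);
  int blockStart = 0;
  int lastLabel = -2; // sentinel distinct from the -1 used for the empty suffix
  for (int i = 0; i < suffixLeadLabels.length; i++) {
    if (suffixLeadLabels[i] != lastLabel) {
      // only cut on a label boundary, mirroring the condition in writeBlocks
      if (i - blockStart >= minItems && suffixLeadLabels.length - blockStart > maxItems) {
        starts.add(i);
        blockStart = i;
      }
      lastLabel = suffixLeadLabels[i];
    }
  }
  return starts; // the final block runs from the last start index to the end
}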
  8. Lucene90BlockTreeTermsWriter.TermsWriter.writeBlock
    PendingBlock records the block's start position (startFP) in the .tim file; this pointer later becomes the value the FST index maps to.
private PendingBlock writeBlock(
int prefixLength,
boolean isFloor,
int floorLeadLabel,
int start,
int end,
boolean hasTerms,
boolean hasSubBlocks)
throws IOException {
// ....
//
if (isLeafBlock) {
subIndices = null;
StatsWriter statsWriter =
new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
for (int i = start; i < end; i++) {
PendingEntry ent = pending.get(i);
assert ent.isTerm : "i=" + i;

PendingTerm term = (PendingTerm) ent;

assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix;
BlockTermState state = term.state;
final int suffix = term.termBytes.length - prefixLength;
// if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write term suffix=" + brToString(suffixBytes));
// }

// For leaf block we write suffix straight
suffixLengthsWriter.writeVInt(suffix);
suffixWriter.append(term.termBytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;

// Write term stats, to separate byte[] blob:
statsWriter.add(state.docFreq, state.totalTermFreq);

// Write term meta data
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
}

} else {
// Block has at least one prefix term or a sub block:
subIndices = new ArrayList<>();
StatsWriter statsWriter =
new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS);
for (int i = start; i < end; i++) {
PendingEntry ent = pending.get(i);
if (ent.isTerm) {
PendingTerm term = (PendingTerm) ent;

assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix;
BlockTermState state = term.state;
final int suffix = term.termBytes.length - prefixLength;

// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block, and 1 bit to record if
// it's a prefix term. Terms cannot be larger than ~32 KB
// so we won't run out of bits:

suffixLengthsWriter.writeVInt(suffix << 1);
suffixWriter.append(term.termBytes, prefixLength, suffix);

// Write term stats, to separate byte[] blob:
statsWriter.add(state.docFreq, state.totalTermFreq);

// TODO: now that terms dict "sees" these longs,
// we can explore better column-stride encodings
// to encode all long[0]s for this block at
// once, all long[1]s, etc., e.g. using
// Simple64. Alternatively, we could interleave
// stats + meta ... no reason to have them
// separate anymore:

// Write term meta data
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
} else {
PendingBlock block = (PendingBlock) ent;
assert StringHelper.startsWith(block.prefix, prefix);
final int suffix = block.prefix.length - prefixLength;
assert StringHelper.startsWith(block.prefix, prefix);

assert suffix > 0;

// For non-leaf block we borrow 1 bit to record
// if entry is term or sub-block:f
suffixLengthsWriter.writeVInt((suffix << 1) | 1);
suffixWriter.append(block.prefix.bytes, prefixLength, suffix);

// if (DEBUG2) {
// BytesRef suffixBytes = new BytesRef(suffix);
// System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
// suffixBytes.length = suffix;
// System.out.println(" write sub-block suffix=" + brToString(suffixBytes) + "
// subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
// }

assert floorLeadLabel == -1
|| (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel
: "floorLeadLabel="
+ floorLeadLabel
+ " suffixLead="
+ (block.prefix.bytes[prefixLength] & 0xff);
assert block.fp < startFP;

suffixLengthsWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}
statsWriter.finish();

assert subIndices.size() != 0;
}
// ...
// Write term meta data byte[] blob
termsOut.writeVInt((int) metaWriter.size());
// written into the .tim file
metaWriter.copyTo(termsOut);
metaWriter.reset();
}
  9. Lucene90PostingsWriter.encodeTerm
// Here the DataOutput is a ByteBuffersDataOutput, an in-memory structure; it records, for each Term, the file positions of its postings, and is eventually written into the .tim file
public void encodeTerm(
DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute)
throws IOException {
IntBlockTermState state = (IntBlockTermState) _state;
if (absolute) {
lastState = emptyState;
assert lastState.docStartFP == 0;
}

if (lastState.singletonDocID != -1
&& state.singletonDocID != -1
&& state.docStartFP == lastState.docStartFP) {
// With runs of rare values such as ID fields, the increment of pointers in the docs file is
// often 0.
// Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we
// encode the delta
// between consecutive doc IDs to save space.
final long delta = (long) state.singletonDocID - lastState.singletonDocID;
out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01);
} else {
out.writeVLong((state.docStartFP - lastState.docStartFP) << 1);
if (state.singletonDocID != -1) {
out.writeVInt(state.singletonDocID);
}
}

if (writePositions) {
out.writeVLong(state.posStartFP - lastState.posStartFP);
if (writePayloads || writeOffsets) {
out.writeVLong(state.payStartFP - lastState.payStartFP);
}
}
if (writePositions) {
if (state.lastPosBlockOffset != -1) {
out.writeVLong(state.lastPosBlockOffset);
}
}
if (state.skipOffset != -1) {
out.writeVLong(state.skipOffset);
}
lastState = state;
}
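The singleton-docID branch above zigzag-encodes the delta so that negative deltas also become small unsigned numbers before vLong encoding. A sketch of the transform (Lucene's own version lives in org.apache.lucene.util.BitUtil):

static long zigZagEncode(long l) {
  return (l >> 63) ^ (l << 1); // 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
}

static long zigZagDecode(long l) {
  return (l >>> 1) ^ -(l & 1);
}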
  10. Lucene90BlockTreeTermsWriter.TermsWriter.finish
public void finish() throws IOException {
if (numTerms > 0) {
// if (DEBUG) System.out.println("BTTW: finish prefixStarts=" +
// Arrays.toString(prefixStarts));

// Add empty term to force closing of all final blocks:
pushTerm(new BytesRef());

// TODO: if pending.size() is already 1 with a non-zero prefix length
// we can save writing a "degenerate" root block, but we have to
// fix all the places that assume the root block's prefix is the empty string:
pushTerm(new BytesRef());
// Build the FST index over all remaining Terms; pending.get(0) is the root block, whose index covers all pending entries
writeBlocks(0, pending.size());

// We better have one final "root" block:
assert pending.size() == 1 && !pending.get(0).isTerm
: "pending.size()=" + pending.size() + " pending=" + pending;
final PendingBlock root = (PendingBlock) pending.get(0);
assert root.prefix.length == 0;
final BytesRef rootCode = root.index.getEmptyOutput();
assert rootCode != null;

ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput();
fields.add(metaOut);

metaOut.writeVInt(fieldInfo.number);
metaOut.writeVLong(numTerms);
metaOut.writeVInt(rootCode.length);
metaOut.writeBytes(rootCode.bytes, rootCode.offset, rootCode.length);
assert fieldInfo.getIndexOptions() != IndexOptions.NONE;
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
metaOut.writeVLong(sumTotalTermFreq);
}
metaOut.writeVLong(sumDocFreq);
metaOut.writeVInt(docsSeen.cardinality());
writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes));
writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes));
metaOut.writeVLong(indexOut.getFilePointer());
// Write FST to index
// indexOut here is the .tip file
root.index.save(metaOut, indexOut);
// System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);

/*
if (DEBUG) {
final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
Util.toDot(root.index, w, false, false);
System.out.println("SAVED to " + dotFileName);
w.close();
}
*/

} else {
assert sumTotalTermFreq == 0
|| fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1;
assert sumDocFreq == 0;
assert docsSeen.cardinality() == 0;
}
}
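To make the role of the FST concrete, here is a tiny self-contained sketch of my own, assuming Lucene 9.x's FSTCompiler API, with made-up prefixes and file pointers. The .tip index is essentially such a map from block prefixes to outputs that locate the block in .tim (the real block tree stores byte-sequence outputs encoding the file pointer plus flags, while this sketch uses plain long outputs):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class TinyTermIndex {
  public static void main(String[] args) throws Exception {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> compiler =
        new FSTCompiler.Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs).build();
    IntsRefBuilder scratch = new IntsRefBuilder();
    // Inputs must be added in sorted order, just like terms arrive at the writer.
    String[] blockPrefixes = {"app", "apple", "apply", "banana"};
    long[] fakeTimFilePointers = {16L, 100L, 230L, 512L}; // made-up stand-ins for startFP
    for (int i = 0; i < blockPrefixes.length; i++) {
      compiler.add(Util.toIntsRef(new BytesRef(blockPrefixes[i]), scratch), fakeTimFilePointers[i]);
    }
    FST<Long> fst = compiler.compile();
    System.out.println(Util.get(fst, new BytesRef("apply"))); // prints 230
  }
}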

Summary

Lucene's overall inverted-index design is quite complex; just writing the in-memory data to disk already takes this many steps. At the same time Lucene keeps clean abstractions: a different index format only needs to implement the corresponding interfaces and can be plugged into the engine as a codec.

My remaining questions: how do we evaluate the performance impact of these design choices? Does the FST design really need to be this complex just to locate a term's posting list on disk, and are there other data structures that could serve the same purpose? I hope to dig deeper and find better answers as I continue studying.